From fcb4253c9cdaece2dd3a2ab8c6842ed41e24f123 Mon Sep 17 00:00:00 2001 From: James Zern Date: Fri, 24 Jul 2015 16:54:51 -0700 Subject: [PATCH] update libyuv to r1456 picks up build warning fixes for visual studio 2015 Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520 --- examples.mk | 5 +- third_party/libyuv/README.libvpx | 3 +- third_party/libyuv/include/libyuv/convert.h | 2 + third_party/libyuv/include/libyuv/convert_argb.h | 16 +- third_party/libyuv/include/libyuv/convert_from.h | 11 + .../libyuv/include/libyuv/convert_from_argb.h | 15 +- .../libyuv/include/libyuv/planar_functions.h | 30 +- third_party/libyuv/include/libyuv/rotate_row.h | 138 +++ third_party/libyuv/include/libyuv/row.h | 241 ++++-- third_party/libyuv/include/libyuv/scale_row.h | 236 ++++- third_party/libyuv/include/libyuv/version.h | 2 +- third_party/libyuv/source/compare.cc | 6 +- .../source/{compare_posix.cc => compare_gcc.cc} | 0 third_party/libyuv/source/compare_neon64.cc | 2 +- third_party/libyuv/source/compare_win.cc | 15 +- third_party/libyuv/source/convert.cc | 229 ++--- third_party/libyuv/source/convert_argb.cc | 100 ++- third_party/libyuv/source/convert_from.cc | 127 +++ third_party/libyuv/source/convert_from_argb.cc | 127 ++- third_party/libyuv/source/cpu_id.cc | 57 +- third_party/libyuv/source/mjpeg_decoder.cc | 6 + third_party/libyuv/source/mjpeg_validate.cc | 2 +- third_party/libyuv/source/planar_functions.cc | 299 ++++++- third_party/libyuv/source/rotate.cc | 865 +------------------ third_party/libyuv/source/rotate_any.cc | 55 ++ third_party/libyuv/source/rotate_argb.cc | 24 +- third_party/libyuv/source/rotate_common.cc | 92 ++ third_party/libyuv/source/rotate_gcc.cc | 493 +++++++++++ third_party/libyuv/source/rotate_mips.cc | 9 +- third_party/libyuv/source/rotate_neon.cc | 1 + third_party/libyuv/source/rotate_neon64.cc | 6 +- third_party/libyuv/source/rotate_win.cc | 248 ++++++ third_party/libyuv/source/row_any.cc | 933 ++++++++++---------- third_party/libyuv/source/row_common.cc | 227 +++-- .../libyuv/source/{row_posix.cc => row_gcc.cc} | 310 ++++--- third_party/libyuv/source/row_neon.cc | 95 +- third_party/libyuv/source/row_neon64.cc | 276 +++--- third_party/libyuv/source/row_win.cc | 955 +++++++++++++++------ third_party/libyuv/source/scale.cc | 404 ++++----- third_party/libyuv/source/scale_any.cc | 200 +++++ third_party/libyuv/source/scale_argb.cc | 116 ++- third_party/libyuv/source/scale_common.cc | 48 +- .../libyuv/source/{scale_posix.cc => scale_gcc.cc} | 35 +- third_party/libyuv/source/scale_neon.cc | 272 ++++++ third_party/libyuv/source/scale_neon64.cc | 374 ++++++-- third_party/libyuv/source/scale_win.cc | 376 +++++--- 46 files changed, 5264 insertions(+), 2819 deletions(-) create mode 100644 third_party/libyuv/include/libyuv/rotate_row.h rename third_party/libyuv/source/{compare_posix.cc => compare_gcc.cc} (100%) create mode 100644 third_party/libyuv/source/rotate_any.cc create mode 100644 third_party/libyuv/source/rotate_common.cc create mode 100644 third_party/libyuv/source/rotate_gcc.cc create mode 100644 third_party/libyuv/source/rotate_win.cc rename third_party/libyuv/source/{row_posix.cc => row_gcc.cc} (97%) create mode 100644 third_party/libyuv/source/scale_any.cc rename third_party/libyuv/source/{scale_posix.cc => scale_gcc.cc} (97%) diff --git a/examples.mk b/examples.mk index fad02cf..8327ca0 100644 --- a/examples.mk +++ b/examples.mk @@ -22,17 +22,18 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \ third_party/libyuv/source/planar_functions.cc \ 
third_party/libyuv/source/row_any.cc \ third_party/libyuv/source/row_common.cc \ + third_party/libyuv/source/row_gcc.cc \ third_party/libyuv/source/row_mips.cc \ third_party/libyuv/source/row_neon.cc \ third_party/libyuv/source/row_neon64.cc \ - third_party/libyuv/source/row_posix.cc \ third_party/libyuv/source/row_win.cc \ third_party/libyuv/source/scale.cc \ + third_party/libyuv/source/scale_any.cc \ third_party/libyuv/source/scale_common.cc \ + third_party/libyuv/source/scale_gcc.cc \ third_party/libyuv/source/scale_mips.cc \ third_party/libyuv/source/scale_neon.cc \ third_party/libyuv/source/scale_neon64.cc \ - third_party/libyuv/source/scale_posix.cc \ third_party/libyuv/source/scale_win.cc \ LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \ diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx index 6a5b60c..09693c1 100644 --- a/third_party/libyuv/README.libvpx +++ b/third_party/libyuv/README.libvpx @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1305 +Version: 1456 License: BSD License File: LICENSE @@ -13,4 +13,3 @@ which down-samples the original input video (e.g. 1280x720) a number of times in order to encode multiple resolution bit streams. Local Modifications: -cherry pick r1311 'disable nv12 avx2 for vs9/10 that dont support avx2 instructions.' diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h index 97936ad..a8d3fa0 100644 --- a/third_party/libyuv/include/libyuv/convert.h +++ b/third_party/libyuv/include/libyuv/convert.h @@ -71,6 +71,8 @@ int I400ToI420(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); +#define J400ToJ420 I400ToI420 + // Convert NV12 to I420. LIBYUV_API int NV12ToI420(const uint8* src_y, int src_stride_y, diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h index a0de89e..360c6d3 100644 --- a/third_party/libyuv/include/libyuv/convert_argb.h +++ b/third_party/libyuv/include/libyuv/convert_argb.h @@ -68,20 +68,20 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Convert I400 (grey) to ARGB. +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. LIBYUV_API int I400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height); -// Alias. -#define YToARGB I400ToARGB_Reference - -// Convert I400 to ARGB. Reverse of ARGBToI400. +// Convert J400 (jpeg grey) to ARGB. LIBYUV_API -int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height); +int J400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Alias. +#define YToARGB I400ToARGB // Convert NV12 to ARGB. LIBYUV_API diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h index d6c0e3d..9fd8d4d 100644 --- a/third_party/libyuv/include/libyuv/convert_from.h +++ b/third_party/libyuv/include/libyuv/convert_from.h @@ -137,6 +137,17 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, uint8* dst_frame, int dst_stride_frame, int width, int height); +// Convert I420 To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 are recommended. +// The first byte of the dither matrix applies to the upper-left pixel.
+ +LIBYUV_API +int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_frame, int dst_stride_frame, + const uint8* dither4x4, int width, int height); + LIBYUV_API int I420ToARGB1555(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h index c592fc2..1df5320 100644 --- a/third_party/libyuv/include/libyuv/convert_from_argb.h +++ b/third_party/libyuv/include/libyuv/convert_from_argb.h @@ -61,12 +61,15 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height); -// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). -// Values in dither matrix from 0 to 255. 128 is best for no dither. +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). +// Values in dither matrix from 0 to 7 are recommended. +// The first byte of the dither matrix applies to the upper-left pixel. +// TODO(fbarchard): Consider pointer to 2d array for dither4x4. +// const uint8(*dither)[4][4]; LIBYUV_API int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither8x8, int width, int height); + const uint8* dither4x4, int width, int height); // Convert ARGB To ARGB1555. LIBYUV_API @@ -140,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, int width, int height); +// Convert ARGB to G. (Reverse of J400ToARGB, which replicates G back to ARGB) +LIBYUV_API +int ARGBToG(const uint8* src_argb, int src_stride_argb, + uint8* dst_g, int dst_stride_g, + int width, int height); + // Convert ARGB To NV12. LIBYUV_API int ARGBToNV12(const uint8* src_argb, int src_stride_argb, diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h index d10a169..ae994db 100644 --- a/third_party/libyuv/include/libyuv/planar_functions.h +++ b/third_party/libyuv/include/libyuv/planar_functions.h @@ -45,6 +45,7 @@ int I400ToI400(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height); +#define J400ToJ400 I400ToI400 // Copy I422 to I422. #define I422ToI422 I422Copy @@ -84,6 +85,18 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, uint8* dst_v, int dst_stride_v, int width, int height); +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + +LIBYUV_API +int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height); + // Convert I420 to I400. (calls CopyPlane ignoring u/v). LIBYUV_API int I420ToI400(const uint8* src_y, int src_stride_y, @@ -93,6 +106,7 @@ int I420ToI400(const uint8* src_y, int src_stride_y, int width, int height); // Alias +#define J420ToJ400 I420ToI400 #define I420ToI420Mirror I420Mirror // I420 mirror.
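The dithered RGB565 entry points above swap the old 64-byte dither8x8 table for a 16-byte 4x4 table that is tiled across the image. A minimal usage sketch, assuming an ordered Bayer-style table; the table values and the wrapper function are illustrative, and only the ARGBToRGB565Dither signature comes from the headers in this change:

#include "libyuv/convert_from_argb.h"

// 4x4 ordered dither table, values 0 to 7 as recommended above.
// The first byte applies to the upper-left pixel of each 4x4 tile.
static const uint8 kDither4x4[16] = {
  0, 4, 1, 5,
  6, 2, 7, 3,
  1, 5, 0, 4,
  7, 3, 6, 2,
};

// Hypothetical wrapper: convert one ARGB frame to dithered RGB565.
int ConvertWithDither(const uint8* argb, int argb_stride,
                      uint8* rgb565, int rgb565_stride,
                      int width, int height) {
  return libyuv::ARGBToRGB565Dither(argb, argb_stride,
                                    rgb565, rgb565_stride,
                                    kDither4x4, width, height);
}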
@@ -387,24 +401,24 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0, uint8* dst_argb, int dst_stride_argb, int width, int height, int interpolation); -#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ - defined(TARGET_IPHONE_SIMULATOR) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif +// The following are available on all x86 platforms: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_ARGBAFFINEROW_SSE2 +#endif -// Row functions for copying a pixels from a source with a slope to a row +// Row function for copying pixels from a source with a slope to a row // of destination. Useful for scaling, rotation, mirror, texture mapping. LIBYUV_API void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); -// The following are available on all x86 platforms: -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width); -#define HAS_ARGBAFFINEROW_SSE2 -#endif // LIBYUV_DISABLE_X86 // Shuffle ARGB channel order. e.g. BGRA to ARGB. // shuffler is 16 bytes and must be aligned. diff --git a/third_party/libyuv/include/libyuv/rotate_row.h b/third_party/libyuv/include/libyuv/rotate_row.h new file mode 100644 index 0000000..c41cf32 --- /dev/null +++ b/third_party/libyuv/include/libyuv/rotate_row.h @@ -0,0 +1,138 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef INCLUDE_LIBYUV_ROTATE_ROW_H_ // NOLINT +#define INCLUDE_LIBYUV_ROTATE_ROW_H_ + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) +#define LIBYUV_DISABLE_X86 +#endif + +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + +// TODO(fbarchard): switch to standard form of inline; fails on clangcl. 
+#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#if defined(__APPLE__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".private_extern _" #name " \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +"_" #name ": \n" +#else +#define DECLARE_FUNCTION(name) \ + ".text \n" \ + ".align 4,0x90 \n" \ +#name ": \n" +#endif +#endif + +// The following are available for Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) +#define HAS_TRANSPOSEWX8_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +#define HAS_TRANSPOSEWX8_SSSE3 +#endif + +// The following are available for 32 bit GCC: +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +// The following are available for 64 bit GCC but not NaCL: +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +#define HAS_TRANSPOSEWX8_FAST_SSSE3 +#define HAS_TRANSPOSEUVWX8_SSE2 +#endif + +#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_TRANSPOSEWX8_NEON +#define HAS_TRANSPOSEUVWX8_NEON +#endif + +#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ + defined(__mips__) && \ + defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSEWX8_MIPS_DSPR2 +#define HAS_TRANSPOSEUVWx8_MIPS_DSPR2 +#endif // defined(__mips__) + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width, int height); + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_Any_NEON(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Fast_Any_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +void TransposeWx8_Any_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height); + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_NEON(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* 
dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + +#endif // INCLUDE_LIBYUV_ROTATE_ROW_H_ NOLINT diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h index 80e844b..ebae3e7 100644 --- a/third_party/libyuv/include/libyuv/row.h +++ b/third_party/libyuv/include/libyuv/row.h @@ -37,10 +37,8 @@ extern "C" { free(var##_mem); \ var = 0 -#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ - defined(TARGET_IPHONE_SIMULATOR) || \ - (defined(__i386__) && !defined(__SSE2__)) || \ - (defined(_MSC_VER) && defined(__clang__)) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif // True if compiling for SSSE3 as a requirement. @@ -48,6 +46,9 @@ extern "C" { #define LIBYUV_SSSE3_ONLY #endif +#if defined(__native_client__) +#define LIBYUV_DISABLE_NEON +#endif // clang >= 3.5.0 required for Arm64. #if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON) #if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5)) @@ -63,11 +64,11 @@ extern "C" { #define HAS_ABGRTOYROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBSETROW_X86 #define HAS_ARGBSHUFFLEROW_SSE2 #define HAS_ARGBSHUFFLEROW_SSSE3 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 -#define HAS_ARGBTOBAYERGGROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 @@ -95,7 +96,8 @@ extern "C" { #define HAS_I422TOUYVYROW_SSE2 #define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 -// #define HAS_J422TOARGBROW_SSSE3 +#define HAS_J400TOARGBROW_SSE2 +#define HAS_J422TOARGBROW_SSSE3 #define HAS_MERGEUVROW_SSE2 #define HAS_MIRRORROW_SSE2 #define HAS_MIRRORROW_SSSE3 @@ -112,15 +114,13 @@ extern "C" { #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 -#define HAS_SETROW_X86 #define HAS_SETROW_ERMS -#define HAS_ARGBSETROW_X86 +#define HAS_SETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 -#define HAS_YTOARGBROW_SSE2 #define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 @@ -157,8 +157,9 @@ extern "C" { #define HAS_SOBELYROW_SSE2 #endif -// The following are available on x64 Visual C: -#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) +// The following are available on x64 Visual C and clangcl. +#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64) && \ + (!defined(__clang__) || defined(__SSSE3__)) #define HAS_I422TOARGBROW_SSSE3 #endif @@ -177,27 +178,31 @@ extern "C" { #endif // __clang__ // Visual C 2012 required for AVX2. -#if defined(_M_IX86) && defined(_MSC_VER) && _MSC_VER >= 1700 +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 #define VISUALC_HAS_AVX2 1 #endif // VisualStudio >= 2012 // The following are available require VS2012. Port to GCC. #if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) -// TODO(fbarchard): fix AVX2 versions of YUV conversion. 
bug=393 -#define HAS_I422TOABGRROW_AVX2 -#define HAS_I422TOARGBROW_AVX2 -#define HAS_I422TOBGRAROW_AVX2 -#define HAS_I422TORGBAROW_AVX2 -#define HAS_NV12TOARGBROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_ARGB1555TOARGBROW_AVX2 +#define HAS_ARGB4444TOARGBROW_AVX2 #define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 -#define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TORGB565ROW_AVX2 -#define HAS_I422TORGB565ROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 +#define HAS_ARGBTORGB565DITHERROW_SSE2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_I411TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I444TOARGBROW_AVX2 +#define HAS_J400TOARGBROW_AVX2 +#define HAS_NV12TOARGBROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TOARGBROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_RGB565TOARGBROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -214,24 +219,27 @@ extern "C" { #define HAS_ARGBTOYJROW_AVX2 #define HAS_ARGBTOYROW_AVX2 #define HAS_COPYROW_AVX +#define HAS_I400TOARGBROW_AVX2 +#define HAS_I422TOABGRROW_AVX2 +#define HAS_I422TOARGBROW_AVX2 +#define HAS_I422TOBGRAROW_AVX2 +#define HAS_I422TORAWROW_AVX2 +#define HAS_I422TORGB24ROW_AVX2 +#define HAS_I422TORGBAROW_AVX2 #define HAS_INTERPOLATEROW_AVX2 +#define HAS_J422TOARGBROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 #define HAS_SPLITUVROW_AVX2 +#define HAS_UYVYTOARGBROW_AVX2 #define HAS_UYVYTOUV422ROW_AVX2 #define HAS_UYVYTOUVROW_AVX2 #define HAS_UYVYTOYROW_AVX2 -#define HAS_YTOARGBROW_AVX2 +#define HAS_YUY2TOARGBROW_AVX2 #define HAS_YUY2TOUV422ROW_AVX2 #define HAS_YUY2TOUVROW_AVX2 #define HAS_YUY2TOYROW_AVX2 -// The following require HAS_I422TOARGBROW_AVX2 -#if defined(HAS_I422TOARGBROW_AVX2) -#define HAS_YUY2TOARGBROW_AVX2 -#define HAS_UYVYTOARGBROW_AVX2 -#endif - // Effects: #define HAS_ARGBADDROW_AVX2 #define HAS_ARGBATTENUATEROW_AVX2 @@ -240,22 +248,6 @@ extern "C" { #define HAS_ARGBUNATTENUATEROW_AVX2 #endif - -// The following are Yasm x86 only: -// TODO(fbarchard): Port AVX2 to inline. 
-#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM) - (defined(_M_IX86) || defined(_M_X64) || \ - defined(__x86_64__) || defined(__i386__)) -#define HAS_MERGEUVROW_AVX2 -#define HAS_MERGEUVROW_MMX -#define HAS_SPLITUVROW_AVX2 -#define HAS_SPLITUVROW_MMX -#define HAS_UYVYTOYROW_AVX2 -#define HAS_UYVYTOYROW_MMX -#define HAS_YUY2TOYROW_AVX2 -#define HAS_YUY2TOYROW_MMX -#endif - // The following are disabled when SSSE3 is available: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \ @@ -278,7 +270,6 @@ extern "C" { #define HAS_ARGB4444TOYROW_NEON #define HAS_ARGBTOARGB1555ROW_NEON #define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_ARGBTOBAYERGGROW_NEON #define HAS_ARGBTORAWROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565ROW_NEON @@ -292,7 +283,7 @@ extern "C" { #define HAS_BGRATOUVROW_NEON #define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON -#define HAS_I400TOARGBROW_NEON +#define HAS_J400TOARGBROW_NEON #define HAS_I411TOARGBROW_NEON #define HAS_I422TOABGRROW_NEON #define HAS_I422TOARGB1555ROW_NEON @@ -331,11 +322,12 @@ extern "C" { #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON -#define HAS_YTOARGBROW_NEON +#define HAS_I400TOARGBROW_NEON #define HAS_YUY2TOARGBROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON +#define HAS_ARGBTORGB565DITHERROW_NEON // Effects: #define HAS_ARGBADDROW_NEON @@ -388,7 +380,6 @@ typedef __declspec(align(32)) int8 lvec8[32]; typedef __declspec(align(32)) uint16 ulvec16[16]; typedef __declspec(align(32)) uint32 ulvec32[8]; typedef __declspec(align(32)) uint8 ulvec8[32]; - #elif defined(__GNUC__) // Caveat GCC 4.2 to 4.7 have a known issue using vectors with const. #define SIMD_ALIGNED(var) var __attribute__((aligned(16))) @@ -869,6 +860,11 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, int pix); +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix); @@ -884,12 +880,20 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void RGB24ToARGBRow_Any_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_Any_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix); + void RGB565ToARGBRow_Any_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix); void ARGB1555ToARGBRow_Any_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix); void ARGB4444ToARGBRow_Any_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix); +void RGB565ToARGBRow_Any_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix); +void ARGB1555ToARGBRow_Any_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix); +void ARGB4444ToARGBRow_Any_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix); + void RGB24ToARGBRow_Any_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix); void RAWToARGBRow_Any_NEON(const uint8* src_raw, uint8* dst_argb, int pix); void RGB565ToARGBRow_Any_NEON(const uint8* src_rgb565, uint8* dst_argb, @@ -905,6 +909,13 @@ void ARGBToRGB565Row_SSE2(const 
uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -914,6 +925,8 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -922,14 +935,13 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint8* dither8x8, int pix); - -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); -void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); -void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int pix); +void J400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I444ToARGBRow_C(const uint8* src_y, const uint8* src_u, @@ -1038,6 +1050,11 @@ void I444ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1048,6 +1065,11 @@ void I411ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I411ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void NV12ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1097,6 +1119,11 @@ void J422ToARGBRow_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void J422ToARGBRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToBGRARow_SSSE3(const uint8* src_y, const uint8* src_u, 
const uint8* src_v, @@ -1147,11 +1174,21 @@ void I422ToRGB24Row_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_rgb24, int width); +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width); void I422ToRAWRow_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_raw, int width); +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width); void I422ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1177,6 +1214,11 @@ void I444ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I444ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1187,6 +1229,11 @@ void I411ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I411ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1231,6 +1278,16 @@ void YUY2ToARGBRow_Any_AVX2(const uint8* src_yuy2, void UYVYToARGBRow_Any_AVX2(const uint8* src_uyvy, uint8* dst_argb, int width); +void J422ToARGBRow_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); +void J422ToARGBRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToBGRARow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1281,33 +1338,29 @@ void I422ToRGB24Row_Any_SSSE3(const uint8* src_y, const uint8* src_v, uint8* dst_argb, int width); +void I422ToRGB24Row_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); void I422ToRAWRow_Any_SSSE3(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, int width); +void I422ToRAWRow_Any_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width); -void YToARGBRow_C(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_SSE2(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_AVX2(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_Any_SSE2(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_Any_AVX2(const uint8* src_y, - uint8* dst_argb, - int width); -void YToARGBRow_Any_NEON(const uint8* src_y, - uint8* dst_argb, - int width); +void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int width); // ARGB preattenuated alpha blend. 
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, @@ -1375,6 +1428,11 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix); + void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -1384,6 +1442,8 @@ void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width); void I444ToARGBRow_Any_NEON(const uint8* src_y, const uint8* src_u, @@ -1570,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy, void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix); -void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, - uint32 /* selector */, int pix); -void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, - uint32 /* selector */, int pix); -void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 /* selector */, int pix); -void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer, - uint32 /* selector */, int pix); -void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 /* selector */, int pix); - void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1770,6 +1819,18 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width); +void SobelRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelToPlaneRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelToPlaneRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_y, int width); +void SobelXYRow_Any_SSE2(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); +void SobelXYRow_Any_NEON(const uint8* src_sobelx, const uint8* src_sobely, + uint8* dst_argb, int width); void ARGBPolynomialRow_C(const uint8* src_argb, uint8* dst_argb, const float* poly, diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h index 27aa04b..94ad9cf 100644 --- a/third_party/libyuv/include/libyuv/scale_row.h +++ b/third_party/libyuv/include/libyuv/scale_row.h @@ -12,45 +12,66 @@ #define INCLUDE_LIBYUV_SCALE_ROW_H_ #include "libyuv/basic_types.h" +#include "libyuv/scale.h" #ifdef __cplusplus namespace libyuv { extern "C" { #endif -#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \ - 
defined(TARGET_IPHONE_SIMULATOR) +#if defined(__pnacl__) || defined(__CLR_VER) || \ + (defined(__i386__) && !defined(__SSE2__)) #define LIBYUV_DISABLE_X86 #endif +// Visual C 2012 required for AVX2. +#if defined(_M_IX86) && !defined(__clang__) && \ + defined(_MSC_VER) && _MSC_VER >= 1700 +#define VISUALC_HAS_AVX2 1 +#endif // VisualStudio >= 2012 + // The following are available on all x86 platforms: #if !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) +#define HAS_FIXEDDIV1_X86 +#define HAS_FIXEDDIV_X86 +#define HAS_SCALEARGBCOLS_SSE2 +#define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_SCALEARGBFILTERCOLS_SSSE3 +#define HAS_SCALEARGBROWDOWN2_SSE2 +#define HAS_SCALEARGBROWDOWNEVEN_SSE2 +#define HAS_SCALECOLSUP2_SSE2 +#define HAS_SCALEFILTERCOLS_SSSE3 #define HAS_SCALEROWDOWN2_SSE2 -#define HAS_SCALEROWDOWN4_SSE2 #define HAS_SCALEROWDOWN34_SSSE3 #define HAS_SCALEROWDOWN38_SSSE3 -#define HAS_SCALEADDROWS_SSE2 -#define HAS_SCALEFILTERCOLS_SSSE3 -#define HAS_SCALECOLSUP2_SSE2 -#define HAS_SCALEARGBROWDOWN2_SSE2 -#define HAS_SCALEARGBROWDOWNEVEN_SSE2 -#define HAS_SCALEARGBCOLS_SSE2 -#define HAS_SCALEARGBFILTERCOLS_SSSE3 -#define HAS_SCALEARGBCOLSUP2_SSE2 -#define HAS_FIXEDDIV_X86 -#define HAS_FIXEDDIV1_X86 +#define HAS_SCALEROWDOWN4_SSE2 +#endif + +// The following are available on VS2012: +#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2) +#define HAS_SCALEADDROW_AVX2 +#define HAS_SCALEROWDOWN2_AVX2 +#define HAS_SCALEROWDOWN4_AVX2 +#endif + +// The following are available on Visual C: +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && !defined(__clang__) +#define HAS_SCALEADDROW_SSE2 #endif // The following are available on Neon platforms: #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) +#define HAS_SCALEARGBCOLS_NEON +#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEARGBROWDOWNEVEN_NEON +#define HAS_SCALEFILTERCOLS_NEON #define HAS_SCALEROWDOWN2_NEON -#define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEROWDOWN34_NEON #define HAS_SCALEROWDOWN38_NEON -#define HAS_SCALEARGBROWDOWNEVEN_NEON -#define HAS_SCALEARGBROWDOWN2_NEON +#define HAS_SCALEROWDOWN4_NEON +#define HAS_SCALEARGBFILTERCOLS_NEON #endif // The following are available on Mips platforms: @@ -164,10 +185,8 @@ void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int dst_width); -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height); -void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height); +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width); void ScaleARGBRowDown2_C(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width); @@ -194,16 +213,28 @@ void ScaleARGBFilterCols_C(uint8* dst_argb, const uint8* src_argb, void ScaleARGBFilterCols64_C(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); +// Specialized scalers for x86. 
void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, @@ -220,46 +251,124 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height); +void ScaleRowDown2_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Linear_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown2Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleRowDown34_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_Any_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_2_Box_Any_SSSE3(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void 
ScaleAddRow_Any_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width); + void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx); -void ScaleARGBRowDown2_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, - ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width); + + +// ARGB Column functions void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx); -// Row functions. +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBFilterCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); +void ScaleARGBCols_Any_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx); + +// ARGB Row functions +void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Linear_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleARGBRowDown2Linear_Any_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); + +void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDown2_NEON(const uint8* 
src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); -void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width); +void ScaleARGBRowDownEven_Any_SSE2(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_Any_SSE2(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEven_Any_NEON(const uint8* src_argb, ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); +void ScaleARGBRowDownEvenBox_Any_NEON(const uint8* src_argb, + ptrdiff_t src_stride, + int src_stepx, + uint8* dst_argb, int dst_width); // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. @@ -267,7 +376,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // Note - not static due to reuse in convert for 444 to 420. void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); - +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); @@ -302,6 +412,42 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); +void ScaleRowDown2_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Linear_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown2Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +void ScaleRowDown4_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown4Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_0_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown34_1_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32 -> 12 +void ScaleRowDown38_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x3 -> 12x1 +void ScaleRowDown38_3_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +// 32x2 -> 12x1 +void ScaleRowDown38_2_Box_Any_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); + +void ScaleAddRow_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); +void ScaleAddRow_Any_NEON(const uint8* src_ptr, uint16* dst_ptr, int src_width); + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + +void ScaleFilterCols_Any_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx); + + void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h index 9236a7f..9d1d746 100644 --- a/third_party/libyuv/include/libyuv/version.h +++ b/third_party/libyuv/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1305 +#define 
LIBYUV_VERSION 1456 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc index f84a08e..46aa847 100644 --- a/third_party/libyuv/source/compare.cc +++ b/third_party/libyuv/source/compare.cc @@ -37,7 +37,7 @@ uint32 HashDjb2_C(const uint8* src, int count, uint32 seed); #define HAS_HASHDJB2_SSE41 uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed); -#if _MSC_VER >= 1700 +#ifdef VISUALC_HAS_AVX2 #define HAS_HASHDJB2_AVX2 uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed); #endif @@ -138,8 +138,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count); #define HAS_SUMSQUAREERROR_SSE2 uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count); #endif -// Visual C 2012 required for AVX2. -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700 + +#ifdef VISUALC_HAS_AVX2 #define HAS_SUMSQUAREERROR_AVX2 uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count); #endif diff --git a/third_party/libyuv/source/compare_posix.cc b/third_party/libyuv/source/compare_gcc.cc similarity index 100% rename from third_party/libyuv/source/compare_posix.cc rename to third_party/libyuv/source/compare_gcc.cc diff --git a/third_party/libyuv/source/compare_neon64.cc b/third_party/libyuv/source/compare_neon64.cc index cc078f8..6d1e5e1 100644 --- a/third_party/libyuv/source/compare_neon64.cc +++ b/third_party/libyuv/source/compare_neon64.cc @@ -32,7 +32,7 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { "ld1 {v0.16b}, [%0], #16 \n" MEMACCESS(1) "ld1 {v1.16b}, [%1], #16 \n" - "subs %2, %2, #16 \n" + "subs %w2, %w2, #16 \n" "usubl v2.8h, v0.8b, v1.8b \n" "usubl2 v3.8h, v0.16b, v1.16b \n" "smlal v16.4s, v2.4h, v2.4h \n" diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc index e99009a..19806f2 100644 --- a/third_party/libyuv/source/compare_win.cc +++ b/third_party/libyuv/source/compare_win.cc @@ -16,9 +16,11 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { __asm { mov eax, [esp + 4] // src_a @@ -59,7 +61,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) { #if _MSC_VER >= 1700 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable: 4752) -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) { __asm { mov eax, [esp + 4] // src_a @@ -133,7 +135,7 @@ static uvec32 kHashMul3 = { #define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \ _asm _emit 0x40 _asm _emit reg -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src @@ -184,7 +186,7 @@ uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) { // Visual C 2012 required for AVX2. 
#if _MSC_VER >= 1700 -__declspec(naked) __declspec(align(16)) +__declspec(naked) uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { __asm { mov eax, [esp + 4] // src @@ -219,8 +221,7 @@ uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) { } } #endif // _MSC_VER >= 1700 - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc index 41696c1..3ad6bd7 100644 --- a/third_party/libyuv/source/convert.cc +++ b/third_party/libyuv/source/convert.cc @@ -817,22 +817,20 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, src_stride_rgb24 = -src_stride_rgb24; } +// Neon version does direct RGB24 to YUV. #if defined(HAS_RGB24TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { + RGB24ToUVRow = RGB24ToUVRow_Any_NEON; RGB24ToYRow = RGB24ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24ToUVRow = RGB24ToUVRow_NEON; + } } } -#endif -#if defined(HAS_RGB24TOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RGB24ToUVRow = RGB24ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB24ToUVRow = RGB24ToUVRow_NEON; - } - } -#endif +// Other platforms do intermediate conversion from RGB24 to ARGB. +#else #if defined(HAS_RGB24TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3; @@ -841,27 +839,29 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; } } -#endif // HAS_ARGBTOUVROW_SSSE3 - +#endif { -#if !defined(HAS_RGB24TOYROW_NEON) // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; + const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif @@ -894,8 +894,8 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, } #if !defined(HAS_RGB24TOYROW_NEON) free_aligned_buffer_64(row); -#endif } +#endif return 0; } @@ -931,22 +931,20 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, src_stride_raw = -src_stride_raw; } +// Neon version does direct RAW to YUV. #if defined(HAS_RAWTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { + RAWToUVRow = RAWToUVRow_Any_NEON; RAWToYRow = RAWToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_NEON; + if (IS_ALIGNED(width, 16)) { + RAWToUVRow = RAWToUVRow_NEON; + } } } -#endif -#if defined(HAS_RAWTOUVROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - RAWToUVRow = RAWToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RAWToUVRow = RAWToUVRow_NEON; - } - } -#endif +// Other platforms do intermediate conversion from RAW to ARGB. 
+#else #if defined(HAS_RAWTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { RAWToARGBRow = RAWToARGBRow_Any_SSSE3; @@ -955,59 +953,63 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToUVRow = ARGBToUVRow_Any_SSSE3; + ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGBToYRow = ARGBToYRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; } } -#endif // HAS_ARGBTOUVROW_SSSE3 - +#endif { // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; + const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); +#endif for (y = 0; y < height - 1; y += 2) { - #if defined(HAS_RAWTOYROW_NEON) +#if defined(HAS_RAWTOYROW_NEON) RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width); - #else +#else RAWToARGBRow(src_raw, row, width); RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width); ARGBToUVRow(row, kRowSize, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width); - #endif +#endif src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; dst_v += dst_stride_v; } if (height & 1) { - #if defined(HAS_RAWTOYROW_NEON) +#if defined(HAS_RAWTOYROW_NEON) RAWToUVRow(src_raw, 0, dst_u, dst_v, width); RAWToYRow(src_raw, dst_y, width); - #else +#else RAWToARGBRow(src_raw, row, width); ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); - #endif +#endif } - #if !defined(HAS_RAWTOYROW_NEON) +#if !defined(HAS_RAWTOYROW_NEON) free_aligned_buffer_64(row); - #endif } +#endif return 0; } @@ -1043,19 +1045,20 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, src_stride_rgb565 = -src_stride_rgb565; } +// Neon version does direct RGB565 to YUV. #if defined(HAS_RGB565TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { + RGB565ToUVRow = RGB565ToUVRow_Any_NEON; RGB565ToYRow = RGB565ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB565ToYRow = RGB565ToYRow_NEON; - } - RGB565ToUVRow = RGB565ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - RGB565ToUVRow = RGB565ToUVRow_NEON; + if (IS_ALIGNED(width, 16)) { + RGB565ToUVRow = RGB565ToUVRow_NEON; + } } } -#else // HAS_RGB565TOYROW_NEON - +// Other platforms do intermediate conversion from RGB565 to ARGB. 
+#else #if defined(HAS_RGB565TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2; @@ -1064,28 +1067,37 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; + RGB565ToARGBRow = RGB565ToARGBRow_AVX2; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } -#endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_RGB565TOYROW_NEON - +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif { -#if !defined(HAS_RGB565TOYROW_NEON) // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; + const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif @@ -1118,8 +1130,8 @@ int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565, } #if !defined(HAS_RGB565TOYROW_NEON) free_aligned_buffer_64(row); -#endif } +#endif return 0; } @@ -1155,19 +1167,20 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, src_stride_argb1555 = -src_stride_argb1555; } +// Neon version does direct ARGB1555 to YUV. #if defined(HAS_ARGB1555TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB1555ToYRow = ARGB1555ToYRow_NEON; - } - ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } } } -#else // HAS_ARGB1555TOYROW_NEON - +// Other platforms do intermediate conversion from ARGB1555 to ARGB. 
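For readers wondering what the _Any_ suffix buys: the wrappers generated in row_any.cc run the SIMD kernel over the largest prefix that fills whole vector groups, then route the leftover pixels through one extra group staged in a stack buffer. Roughly, for a 4-bytes-per-pixel row function and 16-pixel groups (a simplified sketch, not the actual ANY macro expansion):

    #include <string.h>
    #include <stdint.h>
    typedef uint8_t uint8;

    void RowFn_SSSE3(const uint8* src, uint8* dst, int width); /* width % 16 == 0 */

    void RowFn_Any(const uint8* src, uint8* dst, int width) {
      uint8 temp[16 * 4 * 2];            /* one spare group, input and output */
      int r = width & 15;                /* leftover pixels */
      int n = width & ~15;               /* pixels in whole groups */
      if (n > 0) {
        RowFn_SSSE3(src, dst, n);
      }
      if (r > 0) {
        memcpy(temp, src + n * 4, r * 4);          /* stage the short tail */
        RowFn_SSSE3(temp, temp + 16 * 4, 16);      /* run one padded group */
        memcpy(dst + n * 4, temp + 16 * 4, r * 4); /* copy back only r pixels */
      }
    }

That is why IS_ALIGNED(width, N) upgrades to the plain kernel: it skips the tail bookkeeping when no tail exists. The ARGB1555 fallback below follows the same shape.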
+#else #if defined(HAS_ARGB1555TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; @@ -1176,30 +1189,40 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } -#endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_ARGB1555TOYROW_NEON - +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif { -#if !defined(HAS_ARGB1555TOYROW_NEON) // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; + const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif + for (y = 0; y < height - 1; y += 2) { #if defined(HAS_ARGB1555TOYROW_NEON) ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); @@ -1230,9 +1253,9 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, #endif } #if !defined(HAS_ARGB1555TOYROW_NEON) - free_aligned_buffer_64(row); -#endif + free_aligned_buffer_64(row); } +#endif return 0; } @@ -1268,19 +1291,20 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, src_stride_argb4444 = -src_stride_argb4444; } +// Neon version does direct ARGB4444 to YUV. #if defined(HAS_ARGB4444TOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGB4444ToYRow = ARGB4444ToYRow_NEON; - } - ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } } } -#else // HAS_ARGB4444TOYROW_NEON - +// Other platforms do intermediate conversion from ARGB4444 to ARGB. 
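These converters also depend on align_buffer_64/free_aligned_buffer_64 staying in the same scope and the same preprocessor branch, which is what the brace reshuffling in the hunks above enforces: both are macros that declare and free a hidden backing pointer. An approximation of the pair (the real definitions live in libyuv's row.h):

    #include <stdlib.h>
    #include <stdint.h>
    typedef uint8_t uint8;

    /* Allocate size + 63 bytes and expose a 64-byte-aligned view of it. */
    #define align_buffer_64(var, size)                        \
      uint8* var##_mem = (uint8*)malloc((size) + 63);         \
      uint8* var = (uint8*)(((intptr_t)(var##_mem) + 63) & ~63)

    /* Free the hidden backing pointer declared by align_buffer_64. */
    #define free_aligned_buffer_64(var) \
      free(var##_mem);                  \
      var = 0

    int main(void) {
      align_buffer_64(row, 1024);   /* row is 64-byte aligned */
      row[0] = 0;
      free_aligned_buffer_64(row);  /* frees the underlying allocation */
      return 0;
    }

The ARGB4444 branch below is the last instance of the pattern.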
+#else #if defined(HAS_ARGB4444TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; @@ -1289,28 +1313,37 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToUVRow = ARGBToUVRow_Any_SSSE3; +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_SSSE3; + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; } } #endif -#if defined(HAS_ARGBTOUVROW_SSSE3) +#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToUVRow = ARGBToUVRow_Any_SSSE3; ARGBToYRow = ARGBToYRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { + ARGBToUVRow = ARGBToUVRow_SSSE3; ARGBToYRow = ARGBToYRow_SSSE3; } } -#endif // HAS_ARGBTOUVROW_SSSE3 -#endif // HAS_ARGB4444TOYROW_NEON - +#endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif { -#if !defined(HAS_ARGB4444TOYROW_NEON) // Allocate 2 rows of ARGB. - const int kRowSize = (width * 4 + 15) & ~15; + const int kRowSize = (width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); #endif @@ -1345,8 +1378,8 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } #if !defined(HAS_ARGB4444TOYROW_NEON) free_aligned_buffer_64(row); -#endif } +#endif return 0; } diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc index 66f7660..44756bc 100644 --- a/third_party/libyuv/source/convert_argb.cc +++ b/third_party/libyuv/source/convert_argb.cc @@ -85,6 +85,14 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I444ToARGBRow = I444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I444ToARGBRow = I444ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_I444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I444ToARGBRow = I444ToARGBRow_Any_NEON; @@ -222,6 +230,14 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I411TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I411ToARGBRow = I411ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I411ToARGBRow = I411ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_I411TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I411ToARGBRow = I411ToARGBRow_Any_NEON; @@ -243,13 +259,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, // Convert I400 to ARGB. 
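The hunk below folds the old I400ToARGB_Reference into I400ToARGB and gives full-range grey its own J400ToARGB entry point. The two differ only in the luma transfer: I400 expands studio-range Y (16..235) by the BT.601 gain before replicating it into B, G and R, while J400 copies the byte straight across. A hedged per-pixel sketch (298/256 approximates the 1.164 gain; libyuv's actual row kernels use slightly different fixed-point rounding):

    #include <stdint.h>
    typedef uint8_t uint8;

    static uint8 Clamp255(int v) { return (uint8)(v < 0 ? 0 : v > 255 ? 255 : v); }

    /* I400: studio-range luma, scaled to full range before replication. */
    static void I400PixelToARGB(uint8 y, uint8 argb[4]) {
      int g = Clamp255(((y - 16) * 298 + 128) >> 8);
      argb[0] = argb[1] = argb[2] = (uint8)g;  /* B = G = R */
      argb[3] = 255;                           /* opaque alpha */
    }

    /* J400: full-range (JPEG) grey passes through untouched. */
    static void J400PixelToARGB(uint8 y, uint8 argb[4]) {
      argb[0] = argb[1] = argb[2] = y;
      argb[3] = 255;
    }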
LIBYUV_API -int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { +int I400ToARGB(const uint8* src_y, int src_stride_y, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { int y; - void (*YToARGBRow)(const uint8* y_buf, + void (*I400ToARGBRow)(const uint8* y_buf, uint8* rgb_buf, - int width) = YToARGBRow_C; + int width) = I400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -267,47 +283,47 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_YTOARGBROW_SSE2) +#if defined(HAS_I400TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - YToARGBRow = YToARGBRow_Any_SSE2; + I400ToARGBRow = I400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - YToARGBRow = YToARGBRow_SSE2; + I400ToARGBRow = I400ToARGBRow_SSE2; } } #endif -#if defined(HAS_YTOARGBROW_AVX2) +#if defined(HAS_I400TOARGBROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - YToARGBRow = YToARGBRow_Any_AVX2; + I400ToARGBRow = I400ToARGBRow_Any_AVX2; if (IS_ALIGNED(width, 16)) { - YToARGBRow = YToARGBRow_AVX2; + I400ToARGBRow = I400ToARGBRow_AVX2; } } #endif -#if defined(HAS_YTOARGBROW_NEON) +#if defined(HAS_I400TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - YToARGBRow = YToARGBRow_Any_NEON; + I400ToARGBRow = I400ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - YToARGBRow = YToARGBRow_NEON; + I400ToARGBRow = I400ToARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { - YToARGBRow(src_y, dst_argb, width); + I400ToARGBRow(src_y, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } return 0; } -// Convert I400 to ARGB. +// Convert J400 to ARGB. LIBYUV_API -int I400ToARGB(const uint8* src_y, int src_stride_y, +int J400ToARGB(const uint8* src_y, int src_stride_y, uint8* dst_argb, int dst_stride_argb, int width, int height) { int y; - void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = - I400ToARGBRow_C; + void (*J400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = + J400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -325,24 +341,32 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, height = 1; src_stride_y = dst_stride_argb = 0; } -#if defined(HAS_I400TOARGBROW_SSE2) +#if defined(HAS_J400TOARGBROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - I400ToARGBRow = I400ToARGBRow_Any_SSE2; + J400ToARGBRow = J400ToARGBRow_Any_SSE2; if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_SSE2; + J400ToARGBRow = J400ToARGBRow_SSE2; } } #endif -#if defined(HAS_I400TOARGBROW_NEON) +#if defined(HAS_J400TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + J400ToARGBRow = J400ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + J400ToARGBRow = J400ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_J400TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - I400ToARGBRow = I400ToARGBRow_Any_NEON; + J400ToARGBRow = J400ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I400ToARGBRow = I400ToARGBRow_NEON; + J400ToARGBRow = J400ToARGBRow_NEON; } } #endif for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, width); + J400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; dst_argb += dst_stride_argb; } @@ -552,6 +576,14 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, } } #endif +#if defined(HAS_RGB565TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB565ToARGBRow = RGB565ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + RGB565ToARGBRow = 
RGB565ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_RGB565TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON; @@ -602,6 +634,14 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, } } #endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGB1555TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; @@ -652,6 +692,14 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, } } #endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_AVX2; + } + } +#endif #if defined(HAS_ARGB4444TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc index b743cde..31f1ac9 100644 --- a/third_party/libyuv/source/convert_from.cc +++ b/third_party/libyuv/source/convert_from.cc @@ -739,6 +739,14 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORGB24ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRGB24Row = I422ToRGB24Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRGB24Row = I422ToRGB24Row_AVX2; + } + } +#endif #if defined(HAS_I422TORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRGB24Row = I422ToRGB24Row_Any_NEON; @@ -791,6 +799,14 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_I422TORAWROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToRAWRow = I422ToRAWRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToRAWRow = I422ToRAWRow_AVX2; + } + } +#endif #if defined(HAS_I422TORAWROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { I422ToRAWRow = I422ToRAWRow_Any_NEON; @@ -993,6 +1009,117 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, return 0; } +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8 kDither565_4x4[16] = { + 0, 4, 1, 5, + 6, 2, 7, 3, + 1, 5, 0, 4, + 7, 3, 6, 2, +}; + +// Convert I420 to RGB565 with dithering. +LIBYUV_API +int I420ToRGB565Dither(const uint8* src_y, int src_stride_y, + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither4x4, int width, int height) { + int y; + void (*I422ToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) = I422ToARGBRow_C; + void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; + if (!src_y || !src_u || !src_v || !dst_rgb565 || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_I422TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I422TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I422ToARGBRow = I422ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I422ToARGBRow = I422ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_I422TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + I422ToARGBRow = I422ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I422ToARGBRow = I422ToARGBRow_NEON; + } + } +#endif +#if defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif + { + // Allocate a row of argb. 
+ align_buffer_64(row_argb, width * 4); + for (y = 0; y < height; ++y) { + I422ToARGBRow(src_y, src_u, src_v, row_argb, width); + ARGBToRGB565DitherRow(row_argb, dst_rgb565, + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + free_aligned_buffer_64(row_argb); + } + return 0; +} + // Convert I420 to specified format LIBYUV_API int ConvertFromI420(const uint8* y, int y_stride, diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc index dc2186a..8d1e97a 100644 --- a/third_party/libyuv/source/convert_from_argb.cc +++ b/third_party/libyuv/source/convert_from_argb.cc @@ -72,7 +72,14 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, ARGBToYRow = ARGBToYRow_SSSE3; } } - +#endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } #endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { @@ -139,7 +146,6 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } #endif - #if defined(HAS_ARGBTOYROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYRow = ARGBToYRow_Any_SSSE3; @@ -148,6 +154,14 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -275,6 +289,16 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -317,8 +341,8 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb, #endif { // Allocate a rows of uv. - align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -374,6 +398,16 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToUVRow = ARGBToUVRow_Any_AVX2; + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToUVRow = ARGBToUVRow_AVX2; + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -416,8 +450,8 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb, #endif { // Allocate a rows of uv. 
- align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2); - uint8* row_v = row_u + ((halfwidth + 15) & ~15); + align_buffer_64(row_u, ((halfwidth + 31) & ~31) * 2); + uint8* row_v = row_u + ((halfwidth + 31) & ~31); for (y = 0; y < height - 1; y += 2) { ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width); @@ -492,6 +526,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -591,6 +633,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYRow = ARGBToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYRow = ARGBToYRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYRow = ARGBToYRow_Any_NEON; @@ -804,25 +854,22 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, return 0; } -static const uint8 kDither8x8[64] = { - 0, 128, 32, 160, 8, 136, 40, 168, - 192, 64, 224, 96, 200, 72, 232, 104, - 48, 176, 16, 144, 56, 184, 24, 152, - 240, 112, 208, 80, 248, 120, 216, 88, - 12, 140, 44, 172, 4, 132, 36, 164, - 204, 76, 236, 108, 196, 68, 228, 100, - 60, 188, 28, 156, 52, 180, 20, 148, - 252, 124, 220, 92, 244, 116, 212, 84, +// Ordered 8x8 dither for 888 to 565. Values from 0 to 7. +static const uint8 kDither565_4x4[16] = { + 0, 4, 1, 5, + 6, 2, 7, 3, + 1, 5, 0, 4, + 7, 3, 6, 2, }; -// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). +// Convert ARGB To RGB565 with 4x4 dither matrix (16 bytes). 
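The switch from an 8x8 matrix of 0..255 offsets to a 4x4 matrix of 0..7 matches what 565 truncation can actually use: only the bits about to be discarded. Each row function now takes one packed row of the table as a uint32 and adds the per-column value to every channel before packing. A sketch of the per-row math, assuming B,G,R,A byte order in memory and a little-endian packed dither word:

    #include <stdint.h>
    typedef uint8_t uint8;
    typedef uint16_t uint16;
    typedef uint32_t uint32;

    static int Clamp255(int v) { return v > 255 ? 255 : v; }

    /* Dither one row of ARGB (B,G,R,A in memory) down to RGB565. */
    static void ToRGB565DitherRow(const uint8* argb, uint16* rgb565,
                                  uint32 dither4, int width) {
      int x;
      for (x = 0; x < width; ++x) {
        int d = (int)((dither4 >> ((x & 3) * 8)) & 0xff);  /* 0..7 per column */
        int b = Clamp255(argb[0] + d);
        int g = Clamp255(argb[1] + d);
        int r = Clamp255(argb[2] + d);
        rgb565[x] = (uint16)((b >> 3) | ((g >> 2) << 5) | ((r >> 3) << 11));
        argb += 4;
      }
    }

Adding at most 7 before dropping the low bits rounds each channel up just often enough to break up banding in gradients without shifting the average level.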
LIBYUV_API int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, - const uint8* dither8x8, int width, int height) { + const uint8* dither4x4, int width, int height) { int y; void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, - const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C; + const uint32 dither4, int pix) = ARGBToRGB565DitherRow_C; if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { return -1; } @@ -831,13 +878,36 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - if (!dither8x8) { - dither8x8 = kDither8x8; - + if (!dither4x4) { + dither4x4 = kDither565_4x4; + } +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_SSE2; + if (IS_ALIGNED(width, 4)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; + } + } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_NEON; + } + } +#endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, - dither8x8 + ((y & 7) << 3), width); + *(uint32*)(dither4x4 + ((y & 3) << 2)), width); src_argb += src_stride_argb; dst_rgb565 += dst_stride_rgb565; } @@ -845,6 +915,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } // Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. 
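Note how the loop above picks the dither word: `dither4x4 + ((y & 3) << 2)` is row `y mod 4` of the 16-byte table, so the pattern tiles every four scanlines vertically and, via `x & 3` inside the row function, every four pixels horizontally. Continuing the previous sketch, same types, with memcpy instead of the pointer cast to sidestep alignment concerns:

    #include <string.h>

    static const uint8 kDither4x4[16] = {
      0, 4, 1, 5,
      6, 2, 7, 3,
      1, 5, 0, 4,
      7, 3, 6, 2,
    };

    void DitherImage(const uint8* argb, int argb_stride,   /* stride in bytes */
                     uint16* rgb565, int rgb565_stride,    /* stride in pixels */
                     int width, int height) {
      int y;
      for (y = 0; y < height; ++y) {
        uint32 row_word;
        memcpy(&row_word, kDither4x4 + ((y & 3) << 2), 4); /* row y mod 4 */
        ToRGB565DitherRow(argb, rgb565, row_word, width);
        argb += argb_stride;
        rgb565 += rgb565_stride;
      }
    }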
LIBYUV_API int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, @@ -1021,7 +1092,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, int width, int height) { int y; void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; + uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C; void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) = ARGBToYJRow_C; if (!src_argb || @@ -1045,7 +1116,7 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif -#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2) +#if defined(HAS_ARGBTOYJROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { ARGBToYJRow = ARGBToYJRow_Any_AVX2; if (IS_ALIGNED(width, 32)) { @@ -1140,6 +1211,14 @@ int ARGBToJ422(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToYJRow = ARGBToYJRow_Any_NEON; diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc index 1efa265..8a10b00 100644 --- a/third_party/libyuv/source/cpu_id.cc +++ b/third_party/libyuv/source/cpu_id.cc @@ -10,13 +10,12 @@ #include "libyuv/cpu_id.h" -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) #include <intrin.h> // For __cpuidex() #endif #if !defined(__pnacl__) && !defined(__CLR_VER) && \ - !defined(__native_client__) && \ - defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \ - (defined(_M_IX86) || defined(_M_X64)) + !defined(__native_client__) && (defined(_M_IX86) || defined(_M_X64)) && \ + defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219) #include <immintrin.h> // For _xgetbv() #endif @@ -37,23 +36,23 @@ extern "C" { // For functions that use the stack and have runtime checks for overflow, // use SAFEBUFFERS to avoid additional check. -#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#if defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219) #define SAFEBUFFERS __declspec(safebuffers) #else #define SAFEBUFFERS #endif -// Low level cpuid for X86. Returns zeros on other CPUs. -#if !defined(__pnacl__) && !defined(__CLR_VER) && \ - (defined(_M_IX86) || defined(_M_X64) || \ - defined(__i386__) || defined(__x86_64__)) +// Low level cpuid for X86. +#if (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) LIBYUV_API void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) +// Visual C version uses intrinsic or inline x86 assembly. #if (_MSC_FULL_VER >= 160040219) __cpuidex((int*)(cpu_info), info_eax, info_ecx); -#endif -#if defined(_M_IX86) +#elif defined(_M_IX86) __asm { mov eax, info_eax mov ecx, info_ecx @@ -71,7 +70,8 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { cpu_info[3] = cpu_info[2] = cpu_info[1] = cpu_info[0] = 0; } #endif -#else // defined(_MSC_VER) +// GCC version uses inline x86 assembly. 
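Whichever branch compiles (the MSVC intrinsics above or the GCC inline-assembly #else that follows), CpuId ends up with one signature: leaf in eax, subleaf in ecx, four registers out. A hedged sketch of how callers turn that into the kCpuHas* flags used throughout this patch (the flag names here are illustrative; the bit positions are the architectural CPUID ones, and libyuv's real InitCpuFlags also folds in the TestOsSaveYmm check before trusting AVX2):

    #include <stdint.h>
    typedef uint32_t uint32;

    void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info);  /* as above */

    enum { kHasSSE2 = 1, kHasSSSE3 = 2, kHasAVX2 = 4 };  /* example flags */

    int ProbeX86Flags(void) {
      uint32 regs[4];  /* eax, ebx, ecx, edx */
      int flags = 0;
      CpuId(1, 0, regs);                             /* basic feature leaf */
      if (regs[3] & (1u << 26)) flags |= kHasSSE2;   /* edx bit 26 */
      if (regs[2] & (1u << 9))  flags |= kHasSSSE3;  /* ecx bit 9 */
      CpuId(7, 0, regs);                             /* extended feature leaf */
      if (regs[1] & (1u << 5))  flags |= kHasAVX2;   /* ebx bit 5 */
      return flags;
    }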
+#else // defined(_MSC_VER) && !defined(__clang__) uint32 info_ebx, info_edx; asm volatile ( // NOLINT #if defined( __i386__) && defined(__PIC__) @@ -89,36 +89,37 @@ void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) { cpu_info[1] = info_ebx; cpu_info[2] = info_ecx; cpu_info[3] = info_edx; -#endif // defined(_MSC_VER) +#endif // defined(_MSC_VER) && !defined(__clang__) +} +#else // (defined(_M_IX86) || defined(_M_X64) ... +LIBYUV_API +void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { + cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; } +#endif -#if !defined(__native_client__) +// TODO(fbarchard): Enable xgetbv when validator supports it. +#if (defined(_M_IX86) || defined(_M_X64) || \ + defined(__i386__) || defined(__x86_64__)) && \ + !defined(__pnacl__) && !defined(__CLR_VER) && !defined(__native_client__) #define HAS_XGETBV // X86 CPUs have xgetbv to detect OS saves high parts of ymm registers. int TestOsSaveYmm() { uint32 xcr0 = 0u; -#if defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) +#if defined(_MSC_VER) && !defined(__clang__) && (_MSC_FULL_VER >= 160040219) xcr0 = (uint32)(_xgetbv(0)); // VS2010 SP1 required. -#endif -#if defined(_M_IX86) && defined(_MSC_VER) +#elif defined(_M_IX86) && defined(_MSC_VER) && !defined(__clang__) __asm { xor ecx, ecx // xcr 0 _asm _emit 0x0f _asm _emit 0x01 _asm _emit 0xd0 // For VS2010 and earlier. mov xcr0, eax } -#endif -#if defined(__i386__) || defined(__x86_64__) +#elif defined(__i386__) || defined(__x86_64__) asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); -#endif // defined(_MSC_VER) +#endif // defined(__i386__) || defined(__x86_64__) return((xcr0 & 6) == 6); // Is ymm saved? } -#endif // !defined(__native_client__) -#else -LIBYUV_API -void CpuId(uint32 eax, uint32 ecx, uint32* cpu_info) { - cpu_info[0] = cpu_info[1] = cpu_info[2] = cpu_info[3] = 0; -} -#endif +#endif // defined(_M_IX86) || defined(_M_X64) .. // based on libvpx arm_cpudetect.c // For Arm, but public to allow testing on any CPU diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc index 36028c3..75f8a61 100644 --- a/third_party/libyuv/source/mjpeg_decoder.cc +++ b/third_party/libyuv/source/mjpeg_decoder.cc @@ -18,6 +18,12 @@ // Must be included before jpeglib. #include <setjmp.h> #define HAVE_SETJMP + +#if defined(_MSC_VER) +// disable warning 4324: structure was padded due to __declspec(align()) +#pragma warning(disable:4324) +#endif + #endif struct FILE; // For jpeglib.h. diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc index 40ce2f7..8edfbe1 100644 --- a/third_party/libyuv/source/mjpeg_validate.cc +++ b/third_party/libyuv/source/mjpeg_validate.cc @@ -23,7 +23,7 @@ extern "C" { #ifdef ENABLE_SCASB // Multiple of 1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) const uint8* ScanRow_ERMS(const uint8* src, uint32 val, int count) { __asm { mov edx, edi diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc index 75ef775..b96bd50 100644 --- a/third_party/libyuv/source/planar_functions.cc +++ b/third_party/libyuv/source/planar_functions.cc @@ -528,7 +528,7 @@ int ARGBMirror(const uint8* src_argb, int src_stride_argb, return 0; } -// Get a blender that optimized for the CPU, alignment and pixel count. +// Get a blender that is optimized for the CPU and pixel count. 
// As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. LIBYUV_API @@ -677,12 +677,12 @@ int ARGBAdd(const uint8* src_argb0, int src_stride_argb0, height = 1; src_stride_argb0 = src_stride_argb1 = dst_stride_argb = 0; } -#if defined(HAS_ARGBADDROW_SSE2) && defined(_MSC_VER) +#if defined(HAS_ARGBADDROW_SSE2) && (defined(_MSC_VER) && !defined(__clang__)) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_SSE2; } #endif -#if defined(HAS_ARGBADDROW_SSE2) && !defined(_MSC_VER) +#if defined(HAS_ARGBADDROW_SSE2) && !(defined(_MSC_VER) && !defined(__clang__)) if (TestCpuFlag(kCpuHasSSE2)) { ARGBAddRow = ARGBAddRow_Any_SSE2; if (IS_ALIGNED(width, 4)) { @@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, const uint8* src_sobely, uint8* dst, int width)) { int y; - void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerGGRow_C; + void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) = + ARGBToYJRow_C; void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1, @@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - // ARGBToBayer used to select G channel from ARGB. -#if defined(HAS_ARGBTOBAYERGGROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerGGRow_SSE2; + +#if defined(HAS_ARGBTOYJROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToYJRow = ARGBToYJRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + ARGBToYJRow = ARGBToYJRow_SSSE3; } } #endif -#if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; } } #endif -#if defined(HAS_ARGBTOBAYERGGROW_NEON) +#if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON; + ARGBToYJRow = ARGBToYJRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - ARGBToBayerRow = ARGBToBayerGGRow_NEON; + ARGBToYJRow = ARGBToYJRow_NEON; } } #endif + #if defined(HAS_SOBELYROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { SobelYRow = SobelYRow_SSE2; @@ -2040,7 +2041,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, #endif { // 3 rows with edges before/after. - const int kRowSize = (width + kEdge + 15) & ~15; + const int kRowSize = (width + kEdge + 31) & ~31; align_buffer_64(rows, kRowSize * 2 + (kEdge + kRowSize * 3 + kEdge)); uint8* row_sobelx = rows; uint8* row_sobely = rows + kRowSize; @@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, uint8* row_y0 = row_y + kEdge; uint8* row_y1 = row_y0 + kRowSize; uint8* row_y2 = row_y1 + kRowSize; - ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); + ARGBToYJRow(src_argb, row_y0, width); row_y0[-1] = row_y0[0]; memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. 
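Two things are going on in this hunk: ARGBToBayerRow's pshufb-selector G-channel extraction is replaced by ARGBToYJRow full-range luma, and each staged luma row is extruded (`row[-1] = row[0]`, plus the memsets past `width`) so the 3x3 Sobel taps can read one pixel left and right of the image without bounds checks. A generic sketch of the gradient the SobelXRow/SobelYRow kernels compute from three such rows (tap placement and sign conventions are simplified relative to the real kernels):

    #include <stdlib.h>
    #include <stdint.h>
    typedef uint8_t uint8;

    static uint8 Clamp255(int v) { return (uint8)(v > 255 ? 255 : v); }

    /* Sobel magnitude at column x from rows above (y0), at (y1), below (y2).
       Rows must be readable at x - 1 and x + 1, hence the edge extrusion. */
    static uint8 SobelPixel(const uint8* y0, const uint8* y1, const uint8* y2,
                            int x) {
      int gx = (y0[x - 1] + 2 * y1[x - 1] + y2[x - 1]) -
               (y0[x + 1] + 2 * y1[x + 1] + y2[x + 1]);
      int gy = (y0[x - 1] + 2 * y0[x] + y0[x + 1]) -
               (y2[x - 1] + 2 * y2[x] + y2[x + 1]);
      return Clamp255(abs(gx) + abs(gy));  /* |gx| + |gy|, as SobelRow packs it */
    }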
- ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); + ARGBToYJRow(src_argb, row_y1, width); row_y1[-1] = row_y1[0]; memset(row_y1 + width, row_y1[width - 1], 16); memset(row_y2 + width, 0, 16); for (y = 0; y < height; ++y) { - // Convert next row of ARGB to Y. + // Convert next row of ARGB to G. if (y < (height - 1)) { src_argb += src_stride_argb; } - ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); + ARGBToYJRow(src_argb, row_y2, width); row_y2[-1] = row_y2[0]; row_y2[width] = row_y2[width - 1]; @@ -2094,13 +2095,19 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, void (*SobelRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelRow_C; #if defined(HAS_SOBELROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - SobelRow = SobelRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + SobelRow = SobelRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelRow = SobelRow_SSE2; + } } #endif #if defined(HAS_SOBELROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelRow = SobelRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + SobelRow = SobelRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelRow = SobelRow_NEON; + } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, @@ -2115,13 +2122,19 @@ int ARGBSobelToPlane(const uint8* src_argb, int src_stride_argb, void (*SobelToPlaneRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_, int width) = SobelToPlaneRow_C; #if defined(HAS_SOBELTOPLANEROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + SobelToPlaneRow = SobelToPlaneRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_SSE2; + } } #endif #if defined(HAS_SOBELTOPLANEROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - SobelToPlaneRow = SobelToPlaneRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + SobelToPlaneRow = SobelToPlaneRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SobelToPlaneRow = SobelToPlaneRow_NEON; + } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_y, dst_stride_y, @@ -2137,13 +2150,19 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, void (*SobelXYRow)(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) = SobelXYRow_C; #if defined(HAS_SOBELXYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16)) { - SobelXYRow = SobelXYRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + SobelXYRow = SobelXYRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SobelXYRow = SobelXYRow_SSE2; + } } #endif #if defined(HAS_SOBELXYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { - SobelXYRow = SobelXYRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + SobelXYRow = SobelXYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + SobelXYRow = SobelXYRow_NEON; + } } #endif return ARGBSobelize(src_argb, src_stride_argb, dst_argb, dst_stride_argb, @@ -2322,6 +2341,214 @@ int ARGBCopyYToAlpha(const uint8* src_y, int src_stride_y, return 0; } +LIBYUV_API +int YUY2ToNV12(const uint8* src_yuy2, int src_stride_yuy2, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_yuy2 || + 
!dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; + src_stride_yuy2 = -src_stride_yuy2; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + { + int awidth = halfwidth * 2; + // 2 rows of uv + align_buffer_64(rows, awidth * 2); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_yuy2, dst_y, rows, awidth); + SplitUVRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, + rows + awidth, awidth); + InterpolateRow(dst_uv, rows, awidth, awidth, 128); + src_yuy2 += src_stride_yuy2 * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. + SplitUVRow(src_yuy2, dst_y, dst_uv, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + +LIBYUV_API +int UYVYToNV12(const uint8* src_uyvy, int src_stride_uyvy, + uint8* dst_y, int dst_stride_y, + uint8* dst_uv, int dst_stride_uv, + int width, int height) { + int y; + int halfwidth = (width + 1) >> 1; + void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = + SplitUVRow_C; + void (*InterpolateRow)(uint8* dst_ptr, const uint8* src_ptr, + ptrdiff_t src_stride, int dst_width, + int source_y_fraction) = InterpolateRow_C; + if (!src_uyvy || + !dst_y || !dst_uv || + width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; + src_stride_uyvy = -src_stride_uyvy; + } +#if defined(HAS_SPLITUVROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + SplitUVRow = SplitUVRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_SSE2; + } + } +#endif +#if defined(HAS_SPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SplitUVRow = SplitUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SplitUVRow = SplitUVRow_AVX2; + } + } +#endif +#if defined(HAS_SPLITUVROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + SplitUVRow = SplitUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SplitUVRow = SplitUVRow_NEON; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + InterpolateRow = InterpolateRow_Any_SSE2; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSE2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + InterpolateRow = InterpolateRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_SSSE3; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + InterpolateRow = InterpolateRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + InterpolateRow = InterpolateRow_AVX2; + } + } +#endif +#if defined(HAS_INTERPOLATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + InterpolateRow = InterpolateRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + InterpolateRow = InterpolateRow_NEON; + } + } +#endif + + { + int awidth = halfwidth * 2; + // 2 rows of uv + align_buffer_64(rows, awidth * 2); + + for (y = 0; y < height - 1; y += 2) { + // Split Y from UV. + SplitUVRow(src_uyvy, rows, dst_y, awidth); + SplitUVRow(src_uyvy + src_stride_uyvy, rows + awidth, + dst_y + dst_stride_y, awidth); + InterpolateRow(dst_uv, rows, awidth, awidth, 128); + src_uyvy += src_stride_uyvy * 2; + dst_y += dst_stride_y * 2; + dst_uv += dst_stride_uv; + } + if (height & 1) { + // Split Y from UV. 
+ SplitUVRow(src_uyvy, dst_y, dst_uv, width); + } + free_aligned_buffer_64(rows); + } + return 0; +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc index 5acaccf..be3d589 100644 --- a/third_party/libyuv/source/rotate.cc +++ b/third_party/libyuv/source/rotate.cc @@ -13,6 +13,7 @@ #include "libyuv/cpu_id.h" #include "libyuv/convert.h" #include "libyuv/planar_functions.h" +#include "libyuv/rotate_row.h" #include "libyuv/row.h" #ifdef __cplusplus @@ -20,809 +21,39 @@ namespace libyuv { extern "C" { #endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -#if defined(__APPLE__) && defined(__i386__) -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".private_extern _" #name " \n" \ - ".align 4,0x90 \n" \ -"_" #name ": \n" -#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__) -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".align 4,0x90 \n" \ -"_" #name ": \n" -#else -#define DECLARE_FUNCTION(name) \ - ".text \n" \ - ".align 4,0x90 \n" \ -#name ": \n" -#endif -#endif - -#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ - (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) -#define HAS_TRANSPOSE_WX8_NEON -void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -#define HAS_TRANSPOSE_UVWX8_NEON -void TransposeUVWx8_NEON(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width); -#endif - -#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \ - defined(__mips__) && \ - defined(__mips_dsp) && (__mips_dsp_rev >= 2) -#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 -void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); - -void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width); -#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 -void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width); -#endif // defined(__mips__) - -#if !defined(LIBYUV_DISABLE_X86) && \ - defined(_M_IX86) && defined(_MSC_VER) -#define HAS_TRANSPOSE_WX8_SSSE3 -__declspec(naked) __declspec(align(16)) -static void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - __asm { - push edi - push esi - push ebp - mov eax, [esp + 12 + 4] // src - mov edi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst - mov esi, [esp + 12 + 16] // dst_stride - mov ecx, [esp + 12 + 20] // width - - // Read in the data from the source pointer. - // First round of bit swap. - align 4 - convertloop: - movq xmm0, qword ptr [eax] - lea ebp, [eax + 8] - movq xmm1, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm0, xmm1 - movq xmm2, qword ptr [eax] - movdqa xmm1, xmm0 - palignr xmm1, xmm1, 8 - movq xmm3, qword ptr [eax + edi] - lea eax, [eax + 2 * edi] - punpcklbw xmm2, xmm3 - movdqa xmm3, xmm2 - movq xmm4, qword ptr [eax] - palignr xmm3, xmm3, 8 - movq xmm5, qword ptr [eax + edi] - punpcklbw xmm4, xmm5 - lea eax, [eax + 2 * edi] - movdqa xmm5, xmm4 - movq xmm6, qword ptr [eax] - palignr xmm5, xmm5, 8 - movq xmm7, qword ptr [eax + edi] - punpcklbw xmm6, xmm7 - mov eax, ebp - movdqa xmm7, xmm6 - palignr xmm7, xmm7, 8 - // Second round of bit swap. 
- punpcklwd xmm0, xmm2 - punpcklwd xmm1, xmm3 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - palignr xmm2, xmm2, 8 - palignr xmm3, xmm3, 8 - punpcklwd xmm4, xmm6 - punpcklwd xmm5, xmm7 - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - palignr xmm6, xmm6, 8 - palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. - punpckldq xmm0, xmm4 - movq qword ptr [edx], xmm0 - movdqa xmm4, xmm0 - palignr xmm4, xmm4, 8 - movq qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - punpckldq xmm2, xmm6 - movdqa xmm6, xmm2 - palignr xmm6, xmm6, 8 - movq qword ptr [edx], xmm2 - punpckldq xmm1, xmm5 - movq qword ptr [edx + esi], xmm6 - lea edx, [edx + 2 * esi] - movdqa xmm5, xmm1 - movq qword ptr [edx], xmm1 - palignr xmm5, xmm5, 8 - punpckldq xmm3, xmm7 - movq qword ptr [edx + esi], xmm5 - lea edx, [edx + 2 * esi] - movq qword ptr [edx], xmm3 - movdqa xmm7, xmm3 - palignr xmm7, xmm7, 8 - sub ecx, 8 - movq qword ptr [edx + esi], xmm7 - lea edx, [edx + 2 * esi] - jg convertloop - - pop ebp - pop esi - pop edi - ret - } -} - -#define HAS_TRANSPOSE_UVWX8_SSE2 -__declspec(naked) __declspec(align(16)) -static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { - __asm { - push ebx - push esi - push edi - push ebp - mov eax, [esp + 16 + 4] // src - mov edi, [esp + 16 + 8] // src_stride - mov edx, [esp + 16 + 12] // dst_a - mov esi, [esp + 16 + 16] // dst_stride_a - mov ebx, [esp + 16 + 20] // dst_b - mov ebp, [esp + 16 + 24] // dst_stride_b - mov ecx, esp - sub esp, 4 + 16 - and esp, ~15 - mov [esp + 16], ecx - mov ecx, [ecx + 16 + 28] // w - - align 4 - convertloop: - // Read in the data from the source pointer. - // First round of bit swap. - movdqu xmm0, [eax] - movdqu xmm1, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm0 // use xmm7 as temp register. - punpcklbw xmm0, xmm1 - punpckhbw xmm7, xmm1 - movdqa xmm1, xmm7 - movdqu xmm2, [eax] - movdqu xmm3, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm2 - punpcklbw xmm2, xmm3 - punpckhbw xmm7, xmm3 - movdqa xmm3, xmm7 - movdqu xmm4, [eax] - movdqu xmm5, [eax + edi] - lea eax, [eax + 2 * edi] - movdqa xmm7, xmm4 - punpcklbw xmm4, xmm5 - punpckhbw xmm7, xmm5 - movdqa xmm5, xmm7 - movdqu xmm6, [eax] - movdqu xmm7, [eax + edi] - lea eax, [eax + 2 * edi] - movdqu [esp], xmm5 // backup xmm5 - neg edi - movdqa xmm5, xmm6 // use xmm5 as temp register. - punpcklbw xmm6, xmm7 - punpckhbw xmm5, xmm7 - movdqa xmm7, xmm5 - lea eax, [eax + 8 * edi + 16] - neg edi - // Second round of bit swap. - movdqa xmm5, xmm0 - punpcklwd xmm0, xmm2 - punpckhwd xmm5, xmm2 - movdqa xmm2, xmm5 - movdqa xmm5, xmm1 - punpcklwd xmm1, xmm3 - punpckhwd xmm5, xmm3 - movdqa xmm3, xmm5 - movdqa xmm5, xmm4 - punpcklwd xmm4, xmm6 - punpckhwd xmm5, xmm6 - movdqa xmm6, xmm5 - movdqu xmm5, [esp] // restore xmm5 - movdqu [esp], xmm6 // backup xmm6 - movdqa xmm6, xmm5 // use xmm6 as temp register. - punpcklwd xmm5, xmm7 - punpckhwd xmm6, xmm7 - movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. - movdqa xmm6, xmm0 - punpckldq xmm0, xmm4 - punpckhdq xmm6, xmm4 - movdqa xmm4, xmm6 - movdqu xmm6, [esp] // restore xmm6 - movlpd qword ptr [edx], xmm0 - movhpd qword ptr [ebx], xmm0 - movlpd qword ptr [edx + esi], xmm4 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm4 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm2 // use xmm0 as the temp register. 
- punpckldq xmm2, xmm6 - movlpd qword ptr [edx], xmm2 - movhpd qword ptr [ebx], xmm2 - punpckhdq xmm0, xmm6 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm1 // use xmm0 as the temp register. - punpckldq xmm1, xmm5 - movlpd qword ptr [edx], xmm1 - movhpd qword ptr [ebx], xmm1 - punpckhdq xmm0, xmm5 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - movdqa xmm0, xmm3 // use xmm0 as the temp register. - punpckldq xmm3, xmm7 - movlpd qword ptr [edx], xmm3 - movhpd qword ptr [ebx], xmm3 - punpckhdq xmm0, xmm7 - sub ecx, 8 - movlpd qword ptr [edx + esi], xmm0 - lea edx, [edx + 2 * esi] - movhpd qword ptr [ebx + ebp], xmm0 - lea ebx, [ebx + 2 * ebp] - jg convertloop - - mov esp, [esp + 16] - pop ebp - pop edi - pop esi - pop ebx - ret - } -} -#endif -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) -#define HAS_TRANSPOSE_WX8_SSSE3 -static void TransposeWx8_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - ".p2align 2 \n" - "1: \n" - "movq (%0),%%xmm0 \n" - "movq (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "movq (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "movq (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movq (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "movq (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movq (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "lea 0x8(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "sub $0x8,%2 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" - ); -} - -#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) -#define HAS_TRANSPOSE_UVWX8_SSE2 -void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w); - asm ( - DECLARE_FUNCTION(TransposeUVWx8_SSE2) - "push %ebx \n" - "push %esi \n" - "push %edi \n" - "push %ebp \n" - "mov 0x14(%esp),%eax \n" - "mov 0x18(%esp),%edi \n" - "mov 0x1c(%esp),%edx \n" - "mov 0x20(%esp),%esi \n" - "mov 0x24(%esp),%ebx \n" - "mov 0x28(%esp),%ebp \n" - "mov %esp,%ecx \n" - "sub $0x14,%esp \n" - "and $0xfffffff0,%esp \n" - "mov %ecx,0x10(%esp) \n" - "mov 0x2c(%ecx),%ecx \n" - -"1: \n" - "movdqu (%eax),%xmm0 \n" - "movdqu (%eax,%edi,1),%xmm1 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm0,%xmm7 \n" - "punpcklbw %xmm1,%xmm0 \n" - "punpckhbw %xmm1,%xmm7 \n" - "movdqa %xmm7,%xmm1 \n" - "movdqu (%eax),%xmm2 \n" - "movdqu (%eax,%edi,1),%xmm3 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm2,%xmm7 \n" - "punpcklbw %xmm3,%xmm2 \n" - "punpckhbw %xmm3,%xmm7 \n" - "movdqa %xmm7,%xmm3 \n" - "movdqu (%eax),%xmm4 \n" - "movdqu (%eax,%edi,1),%xmm5 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqa %xmm4,%xmm7 \n" - "punpcklbw %xmm5,%xmm4 \n" - "punpckhbw %xmm5,%xmm7 \n" - "movdqa %xmm7,%xmm5 \n" - "movdqu (%eax),%xmm6 \n" - "movdqu (%eax,%edi,1),%xmm7 \n" - "lea (%eax,%edi,2),%eax \n" - "movdqu %xmm5,(%esp) \n" - "neg %edi \n" - "movdqa %xmm6,%xmm5 \n" - "punpcklbw %xmm7,%xmm6 \n" - "punpckhbw %xmm7,%xmm5 \n" - "movdqa %xmm5,%xmm7 \n" - "lea 0x10(%eax,%edi,8),%eax \n" - "neg %edi \n" - "movdqa %xmm0,%xmm5 \n" - "punpcklwd %xmm2,%xmm0 \n" - "punpckhwd %xmm2,%xmm5 \n" - "movdqa %xmm5,%xmm2 \n" - "movdqa %xmm1,%xmm5 \n" - "punpcklwd %xmm3,%xmm1 \n" - "punpckhwd %xmm3,%xmm5 \n" - "movdqa %xmm5,%xmm3 \n" - "movdqa %xmm4,%xmm5 \n" - "punpcklwd %xmm6,%xmm4 \n" - "punpckhwd %xmm6,%xmm5 \n" - "movdqa %xmm5,%xmm6 \n" - "movdqu (%esp),%xmm5 \n" - "movdqu %xmm6,(%esp) \n" - "movdqa %xmm5,%xmm6 \n" - "punpcklwd %xmm7,%xmm5 \n" - "punpckhwd %xmm7,%xmm6 \n" - "movdqa %xmm6,%xmm7 \n" - "movdqa %xmm0,%xmm6 \n" - "punpckldq %xmm4,%xmm0 \n" - "punpckhdq %xmm4,%xmm6 \n" - "movdqa %xmm6,%xmm4 \n" - "movdqu (%esp),%xmm6 \n" - "movlpd %xmm0,(%edx) \n" - "movhpd %xmm0,(%ebx) \n" - "movlpd %xmm4,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm4,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "movdqa %xmm2,%xmm0 \n" - "punpckldq %xmm6,%xmm2 \n" - "movlpd %xmm2,(%edx) \n" - "movhpd %xmm2,(%ebx) \n" - "punpckhdq %xmm6,%xmm0 \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - 
"movdqa %xmm1,%xmm0 \n" - "punpckldq %xmm5,%xmm1 \n" - "movlpd %xmm1,(%edx) \n" - "movhpd %xmm1,(%ebx) \n" - "punpckhdq %xmm5,%xmm0 \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "movdqa %xmm3,%xmm0 \n" - "punpckldq %xmm7,%xmm3 \n" - "movlpd %xmm3,(%edx) \n" - "movhpd %xmm3,(%ebx) \n" - "punpckhdq %xmm7,%xmm0 \n" - "sub $0x8,%ecx \n" - "movlpd %xmm0,(%edx,%esi,1) \n" - "lea (%edx,%esi,2),%edx \n" - "movhpd %xmm0,(%ebx,%ebp,1) \n" - "lea (%ebx,%ebp,2),%ebx \n" - "jg 1b \n" - "mov 0x10(%esp),%esp \n" - "pop %ebp \n" - "pop %edi \n" - "pop %esi \n" - "pop %ebx \n" -#if defined(__native_client__) - "pop %ecx \n" - "and $0xffffffe0,%ecx \n" - "jmp *%ecx \n" -#else - "ret \n" -#endif -); -#endif -#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ - defined(__x86_64__) -// 64 bit version has enough registers to do 16x8 to 8x16 at a time. -#define HAS_TRANSPOSE_WX8_FAST_SSSE3 -static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride, - uint8* dst, int dst_stride, int width) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - ".p2align 2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%3),%%xmm1 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqu (%0),%%xmm2 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm8,%%xmm9 \n" - "palignr $0x8,%%xmm1,%%xmm1 \n" - "palignr $0x8,%%xmm9,%%xmm9 \n" - "movdqu (%0,%3),%%xmm3 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm2,%%xmm10 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm10 \n" - "movdqa %%xmm2,%%xmm3 \n" - "movdqa %%xmm10,%%xmm11 \n" - "movdqu (%0),%%xmm4 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "movdqu (%0,%3),%%xmm5 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm4,%%xmm12 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm12 \n" - "movdqa %%xmm4,%%xmm5 \n" - "movdqa %%xmm12,%%xmm13 \n" - "movdqu (%0),%%xmm6 \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movdqu (%0,%3),%%xmm7 \n" - "lea (%0,%3,2),%0 \n" - "movdqa %%xmm6,%%xmm14 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "punpckhbw %%xmm7,%%xmm14 \n" - "neg %3 \n" - "movdqa %%xmm6,%%xmm7 \n" - "movdqa %%xmm14,%%xmm15 \n" - "lea 0x10(%0,%3,8),%0 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "neg %3 \n" - // Second round of bit swap. - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "palignr $0x8,%%xmm2,%%xmm2 \n" - "palignr $0x8,%%xmm3,%%xmm3 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm4,%%xmm6 \n" - "movdqa %%xmm5,%%xmm7 \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "punpcklwd %%xmm10,%%xmm8 \n" - "punpcklwd %%xmm11,%%xmm9 \n" - "movdqa %%xmm8,%%xmm10 \n" - "movdqa %%xmm9,%%xmm11 \n" - "palignr $0x8,%%xmm10,%%xmm10 \n" - "palignr $0x8,%%xmm11,%%xmm11 \n" - "punpcklwd %%xmm14,%%xmm12 \n" - "punpcklwd %%xmm15,%%xmm13 \n" - "movdqa %%xmm12,%%xmm14 \n" - "movdqa %%xmm13,%%xmm15 \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "punpckldq %%xmm4,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "movdqa %%xmm0,%%xmm4 \n" - "palignr $0x8,%%xmm4,%%xmm4 \n" - "movq %%xmm4,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movdqa %%xmm2,%%xmm6 \n" - "movq %%xmm2,(%1) \n" - "palignr $0x8,%%xmm6,%%xmm6 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movq %%xmm6,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm1,%%xmm5 \n" - "movq %%xmm1,(%1) \n" - "palignr $0x8,%%xmm5,%%xmm5 \n" - "movq %%xmm5,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movq %%xmm3,(%1) \n" - "movdqa %%xmm3,%%xmm7 \n" - "palignr $0x8,%%xmm7,%%xmm7 \n" - "movq %%xmm7,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm12,%%xmm8 \n" - "movq %%xmm8,(%1) \n" - "movdqa %%xmm8,%%xmm12 \n" - "palignr $0x8,%%xmm12,%%xmm12 \n" - "movq %%xmm12,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm14,%%xmm10 \n" - "movdqa %%xmm10,%%xmm14 \n" - "movq %%xmm10,(%1) \n" - "palignr $0x8,%%xmm14,%%xmm14 \n" - "punpckldq %%xmm13,%%xmm9 \n" - "movq %%xmm14,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "movdqa %%xmm9,%%xmm13 \n" - "movq %%xmm9,(%1) \n" - "palignr $0x8,%%xmm13,%%xmm13 \n" - "movq %%xmm13,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "punpckldq %%xmm15,%%xmm11 \n" - "movq %%xmm11,(%1) \n" - "movdqa %%xmm11,%%xmm15 \n" - "palignr $0x8,%%xmm15,%%xmm15 \n" - "sub $0x10,%2 \n" - "movq %%xmm15,(%1,%4) \n" - "lea (%1,%4,2),%1 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((intptr_t)(src_stride)), // %3 - "r"((intptr_t)(dst_stride)) // %4 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" -); -} - -#define HAS_TRANSPOSE_UVWX8_SSE2 -static void TransposeUVWx8_SSE2(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int w) { - asm volatile ( - // Read in the data from the source pointer. - // First round of bit swap. - ".p2align 2 \n" -"1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu (%0,%4),%%xmm1 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm0,%%xmm8 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm8 \n" - "movdqa %%xmm8,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "movdqu (%0,%4),%%xmm3 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "punpckhbw %%xmm3,%%xmm8 \n" - "movdqa %%xmm8,%%xmm3 \n" - "movdqu (%0),%%xmm4 \n" - "movdqu (%0,%4),%%xmm5 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm4,%%xmm8 \n" - "punpcklbw %%xmm5,%%xmm4 \n" - "punpckhbw %%xmm5,%%xmm8 \n" - "movdqa %%xmm8,%%xmm5 \n" - "movdqu (%0),%%xmm6 \n" - "movdqu (%0,%4),%%xmm7 \n" - "lea (%0,%4,2),%0 \n" - "movdqa %%xmm6,%%xmm8 \n" - "punpcklbw %%xmm7,%%xmm6 \n" - "neg %4 \n" - "lea 0x10(%0,%4,8),%0 \n" - "punpckhbw %%xmm7,%%xmm8 \n" - "movdqa %%xmm8,%%xmm7 \n" - "neg %4 \n" - // Second round of bit swap. - "movdqa %%xmm0,%%xmm8 \n" - "movdqa %%xmm1,%%xmm9 \n" - "punpckhwd %%xmm2,%%xmm8 \n" - "punpckhwd %%xmm3,%%xmm9 \n" - "punpcklwd %%xmm2,%%xmm0 \n" - "punpcklwd %%xmm3,%%xmm1 \n" - "movdqa %%xmm8,%%xmm2 \n" - "movdqa %%xmm9,%%xmm3 \n" - "movdqa %%xmm4,%%xmm8 \n" - "movdqa %%xmm5,%%xmm9 \n" - "punpckhwd %%xmm6,%%xmm8 \n" - "punpckhwd %%xmm7,%%xmm9 \n" - "punpcklwd %%xmm6,%%xmm4 \n" - "punpcklwd %%xmm7,%%xmm5 \n" - "movdqa %%xmm8,%%xmm6 \n" - "movdqa %%xmm9,%%xmm7 \n" - // Third round of bit swap. - // Write to the destination pointer. 
- "movdqa %%xmm0,%%xmm8 \n" - "punpckldq %%xmm4,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" // Write back U channel - "movhpd %%xmm0,(%2) \n" // Write back V channel - "punpckhdq %%xmm4,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm2,%%xmm8 \n" - "punpckldq %%xmm6,%%xmm2 \n" - "movlpd %%xmm2,(%1) \n" - "movhpd %%xmm2,(%2) \n" - "punpckhdq %%xmm6,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm1,%%xmm8 \n" - "punpckldq %%xmm5,%%xmm1 \n" - "movlpd %%xmm1,(%1) \n" - "movhpd %%xmm1,(%2) \n" - "punpckhdq %%xmm5,%%xmm8 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "movdqa %%xmm3,%%xmm8 \n" - "punpckldq %%xmm7,%%xmm3 \n" - "movlpd %%xmm3,(%1) \n" - "movhpd %%xmm3,(%2) \n" - "punpckhdq %%xmm7,%%xmm8 \n" - "sub $0x8,%3 \n" - "movlpd %%xmm8,(%1,%5) \n" - "lea (%1,%5,2),%1 \n" - "movhpd %%xmm8,(%2,%6) \n" - "lea (%2,%6,2),%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(w) // %3 - : "r"((intptr_t)(src_stride)), // %4 - "r"((intptr_t)(dst_stride_a)), // %5 - "r"((intptr_t)(dst_stride_b)) // %6 - : "memory", "cc", - "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", - "xmm8", "xmm9" -); -} -#endif -#endif - -static void TransposeWx8_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst[0] = src[0 * src_stride]; - dst[1] = src[1 * src_stride]; - dst[2] = src[2 * src_stride]; - dst[3] = src[3 * src_stride]; - dst[4] = src[4 * src_stride]; - dst[5] = src[5 * src_stride]; - dst[6] = src[6 * src_stride]; - dst[7] = src[7 * src_stride]; - ++src; - dst += dst_stride; - } -} - -static void TransposeWxH_C(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { - int i; - for (i = 0; i < width; ++i) { - int j; - for (j = 0; j < height; ++j) { - dst[i * dst_stride + j] = src[j * src_stride + i]; - } - } -} - LIBYUV_API void TransposePlane(const uint8* src, int src_stride, uint8* dst, int dst_stride, int width, int height) { int i = height; void (*TransposeWx8)(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width) = TransposeWx8_C; -#if defined(HAS_TRANSPOSE_WX8_NEON) + uint8* dst, int dst_stride, int width) = TransposeWx8_C; +#if defined(HAS_TRANSPOSEWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeWx8 = TransposeWx8_NEON; } #endif -#if defined(HAS_TRANSPOSE_WX8_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { - TransposeWx8 = TransposeWx8_SSSE3; +#if defined(HAS_TRANSPOSEWX8_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + TransposeWx8 = TransposeWx8_SSSE3; + } } #endif -#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - TransposeWx8 = TransposeWx8_FAST_SSSE3; +#if defined(HAS_TRANSPOSEWX8_FAST_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + TransposeWx8 = TransposeWx8_Fast_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + TransposeWx8 = TransposeWx8_Fast_SSSE3; + } } #endif -#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) +#if defined(HAS_TRANSPOSEWX8_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { if (IS_ALIGNED(width, 4) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { - TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + TransposeWx8 = TransposeWx8_Fast_MIPS_DSPR2; } else { 
TransposeWx8 = TransposeWx8_MIPS_DSPR2; } @@ -837,7 +68,9 @@ void TransposePlane(const uint8* src, int src_stride, i -= 8; } - TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + if (i > 0) { + TransposeWxH_C(src, src_stride, dst, dst_stride, width, i); + } } LIBYUV_API @@ -955,48 +188,6 @@ void RotatePlane180(const uint8* src, int src_stride, free_aligned_buffer_64(row); } -static void TransposeUVWx8_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width) { - int i; - for (i = 0; i < width; ++i) { - dst_a[0] = src[0 * src_stride + 0]; - dst_b[0] = src[0 * src_stride + 1]; - dst_a[1] = src[1 * src_stride + 0]; - dst_b[1] = src[1 * src_stride + 1]; - dst_a[2] = src[2 * src_stride + 0]; - dst_b[2] = src[2 * src_stride + 1]; - dst_a[3] = src[3 * src_stride + 0]; - dst_b[3] = src[3 * src_stride + 1]; - dst_a[4] = src[4 * src_stride + 0]; - dst_b[4] = src[4 * src_stride + 1]; - dst_a[5] = src[5 * src_stride + 0]; - dst_b[5] = src[5 * src_stride + 1]; - dst_a[6] = src[6 * src_stride + 0]; - dst_b[6] = src[6 * src_stride + 1]; - dst_a[7] = src[7 * src_stride + 0]; - dst_b[7] = src[7 * src_stride + 1]; - src += 2; - dst_a += dst_stride_a; - dst_b += dst_stride_b; - } -} - -static void TransposeUVWxH_C(const uint8* src, int src_stride, - uint8* dst_a, int dst_stride_a, - uint8* dst_b, int dst_stride_b, - int width, int height) { - int i; - for (i = 0; i < width * 2; i += 2) { - int j; - for (j = 0; j < height; ++j) { - dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; - dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; - } - } -} - LIBYUV_API void TransposeUV(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, @@ -1007,17 +198,17 @@ void TransposeUV(const uint8* src, int src_stride, uint8* dst_a, int dst_stride_a, uint8* dst_b, int dst_stride_b, int width) = TransposeUVWx8_C; -#if defined(HAS_TRANSPOSE_UVWX8_NEON) +#if defined(HAS_TRANSPOSEUVWX8_NEON) if (TestCpuFlag(kCpuHasNEON)) { TransposeUVWx8 = TransposeUVWx8_NEON; } #endif -#if defined(HAS_TRANSPOSE_UVWX8_SSE2) +#if defined(HAS_TRANSPOSEUVWX8_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } #endif -#if defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) +#if defined(HAS_TRANSPOSEUVWx8_MIPS_DSPR2) if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; @@ -1036,10 +227,12 @@ void TransposeUV(const uint8* src, int src_stride, i -= 8; } - TransposeUVWxH_C(src, src_stride, - dst_a, dst_stride_a, - dst_b, dst_stride_b, - width, i); + if (i > 0) { + TransposeUVWxH_C(src, src_stride, + dst_a, dst_stride_a, + dst_b, dst_stride_b, + width, i); + } } LIBYUV_API diff --git a/third_party/libyuv/source/rotate_any.cc b/third_party/libyuv/source/rotate_any.cc new file mode 100644 index 0000000..4d6eb34 --- /dev/null +++ b/third_party/libyuv/source/rotate_any.cc @@ -0,0 +1,55 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/rotate.h" +#include "libyuv/rotate_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#define TANY(NAMEANY, TPOS_SIMD, TPOS_C, MASK) \ + void NAMEANY(const uint8* src, int src_stride, \ + uint8* dst, int dst_stride, int width) { \ + int r = width & MASK; \ + int n = width - r; \ + if (n > 0) { \ + TPOS_SIMD(src, src_stride, dst, dst_stride, n); \ + } \ + TPOS_C(src + n, src_stride, dst + n * dst_stride, dst_stride, r); \ + } + +#ifdef HAS_TRANSPOSEWX8_NEON +TANY(TransposeWx8_Any_NEON, TransposeWx8_NEON, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_SSSE3 +TANY(TransposeWx8_Any_SSSE3, TransposeWx8_SSSE3, TransposeWx8_C, 7) +#endif +#ifdef HAS_TRANSPOSEWX8_FAST_SSSE3 +TANY(TransposeWx8_Fast_Any_SSSE3, TransposeWx8_Fast_SSSE3, TransposeWx8_C, 15) +#endif +#ifdef HAS_TRANSPOSEWX8_MIPS_DSPR2 +TANY(TransposeWx8_Any_MIPS_DSPR2, TransposeWx8_MIPS_DSPR2, TransposeWx8_C, 7) +#endif + +#undef TANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/third_party/libyuv/source/rotate_argb.cc b/third_party/libyuv/source/rotate_argb.cc index b9673db..787c0ad 100644 --- a/third_party/libyuv/source/rotate_argb.cc +++ b/third_party/libyuv/source/rotate_argb.cc @@ -27,24 +27,20 @@ extern "C" { (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__)) #define HAS_SCALEARGBROWDOWNEVEN_SSE2 void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride, - int src_stepx, - uint8* dst_ptr, int dst_width); + int src_stepx, uint8* dst_ptr, int dst_width); #endif #if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON) || defined(__aarch64__)) #define HAS_SCALEARGBROWDOWNEVEN_NEON void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride, - int src_stepx, - uint8* dst_ptr, int dst_width); + int src_stepx, uint8* dst_ptr, int dst_width); #endif void ScaleARGBRowDownEven_C(const uint8* src_ptr, int, - int src_stepx, - uint8* dst_ptr, int dst_width); + int src_stepx, uint8* dst_ptr, int dst_width); static void ARGBTranspose(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { + uint8* dst, int dst_stride, int width, int height) { int i; int src_pixel_step = src_stride >> 2; void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride, @@ -68,8 +64,7 @@ static void ARGBTranspose(const uint8* src, int src_stride, } void ARGBRotate90(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { + uint8* dst, int dst_stride, int width, int height) { // Rotate by 90 is a ARGBTranspose with the source read // from bottom to top. So set the source pointer to the end // of the buffer and flip the sign of the source stride. @@ -79,8 +74,7 @@ void ARGBRotate90(const uint8* src, int src_stride, } void ARGBRotate270(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { + uint8* dst, int dst_stride, int width, int height) { // Rotate by 270 is a ARGBTranspose with the destination written // from bottom to top. So set the destination pointer to the end // of the buffer and flip the sign of the destination stride. @@ -90,8 +84,7 @@ void ARGBRotate270(const uint8* src, int src_stride, } void ARGBRotate180(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width, int height) { + uint8* dst, int dst_stride, int width, int height) { // Swap first and last row and mirror the content. 
Uses a temporary row. align_buffer_64(row, width * 4); const uint8* src_bot = src + src_stride * (height - 1); @@ -166,8 +159,7 @@ void ARGBRotate180(const uint8* src, int src_stride, LIBYUV_API int ARGBRotate(const uint8* src_argb, int src_stride_argb, - uint8* dst_argb, int dst_stride_argb, - int width, int height, + uint8* dst_argb, int dst_stride_argb, int width, int height, enum RotationMode mode) { if (!src_argb || width <= 0 || height == 0 || !dst_argb) { return -1; diff --git a/third_party/libyuv/source/rotate_common.cc b/third_party/libyuv/source/rotate_common.cc new file mode 100644 index 0000000..b33a9a0 --- /dev/null +++ b/third_party/libyuv/source/rotate_common.cc @@ -0,0 +1,92 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +void TransposeWx8_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + int i; + for (i = 0; i < width; ++i) { + dst[0] = src[0 * src_stride]; + dst[1] = src[1 * src_stride]; + dst[2] = src[2 * src_stride]; + dst[3] = src[3 * src_stride]; + dst[4] = src[4 * src_stride]; + dst[5] = src[5 * src_stride]; + dst[6] = src[6 * src_stride]; + dst[7] = src[7 * src_stride]; + ++src; + dst += dst_stride; + } +} + +void TransposeUVWx8_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + int i; + for (i = 0; i < width; ++i) { + dst_a[0] = src[0 * src_stride + 0]; + dst_b[0] = src[0 * src_stride + 1]; + dst_a[1] = src[1 * src_stride + 0]; + dst_b[1] = src[1 * src_stride + 1]; + dst_a[2] = src[2 * src_stride + 0]; + dst_b[2] = src[2 * src_stride + 1]; + dst_a[3] = src[3 * src_stride + 0]; + dst_b[3] = src[3 * src_stride + 1]; + dst_a[4] = src[4 * src_stride + 0]; + dst_b[4] = src[4 * src_stride + 1]; + dst_a[5] = src[5 * src_stride + 0]; + dst_b[5] = src[5 * src_stride + 1]; + dst_a[6] = src[6 * src_stride + 0]; + dst_b[6] = src[6 * src_stride + 1]; + dst_a[7] = src[7 * src_stride + 0]; + dst_b[7] = src[7 * src_stride + 1]; + src += 2; + dst_a += dst_stride_a; + dst_b += dst_stride_b; + } +} + +void TransposeWxH_C(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width, int height) { + int i; + for (i = 0; i < width; ++i) { + int j; + for (j = 0; j < height; ++j) { + dst[i * dst_stride + j] = src[j * src_stride + i]; + } + } +} + +void TransposeUVWxH_C(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width, int height) { + int i; + for (i = 0; i < width * 2; i += 2) { + int j; + for (j = 0; j < height; ++j) { + dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)]; + dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1]; + } + } +} + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_gcc.cc b/third_party/libyuv/source/rotate_gcc.cc new file mode 100644 index 0000000..fd385bc --- /dev/null +++ b/third_party/libyuv/source/rotate_gcc.cc @@ -0,0 +1,493 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. 
+ * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for GCC x86 and x64. +#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__)) + +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__))) +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "movq (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "movq (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "movq (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movq (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "movq (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movq (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "lea 0x8(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
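+ // Each xmm register now holds two consecutive 8-byte output rows:
+ // store the low qword, rotate the high qword down with palignr,
+ // store it at dst + dst_stride, then step dst by two rows.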
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "sub $0x8,%2 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__) && !defined(__clang__) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width); + asm ( + DECLARE_FUNCTION(TransposeUVWx8_SSE2) + "push %ebx \n" + "push %esi \n" + "push %edi \n" + "push %ebp \n" + "mov 0x14(%esp),%eax \n" + "mov 0x18(%esp),%edi \n" + "mov 0x1c(%esp),%edx \n" + "mov 0x20(%esp),%esi \n" + "mov 0x24(%esp),%ebx \n" + "mov 0x28(%esp),%ebp \n" + "mov %esp,%ecx \n" + "sub $0x14,%esp \n" + "and $0xfffffff0,%esp \n" + "mov %ecx,0x10(%esp) \n" + "mov 0x2c(%ecx),%ecx \n" + +"1: \n" + "movdqu (%eax),%xmm0 \n" + "movdqu (%eax,%edi,1),%xmm1 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm0,%xmm7 \n" + "punpcklbw %xmm1,%xmm0 \n" + "punpckhbw %xmm1,%xmm7 \n" + "movdqa %xmm7,%xmm1 \n" + "movdqu (%eax),%xmm2 \n" + "movdqu (%eax,%edi,1),%xmm3 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm2,%xmm7 \n" + "punpcklbw %xmm3,%xmm2 \n" + "punpckhbw %xmm3,%xmm7 \n" + "movdqa %xmm7,%xmm3 \n" + "movdqu (%eax),%xmm4 \n" + "movdqu (%eax,%edi,1),%xmm5 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqa %xmm4,%xmm7 \n" + "punpcklbw %xmm5,%xmm4 \n" + "punpckhbw %xmm5,%xmm7 \n" + "movdqa %xmm7,%xmm5 \n" + "movdqu (%eax),%xmm6 \n" + "movdqu (%eax,%edi,1),%xmm7 \n" + "lea (%eax,%edi,2),%eax \n" + "movdqu %xmm5,(%esp) \n" + "neg %edi \n" + "movdqa %xmm6,%xmm5 \n" + "punpcklbw %xmm7,%xmm6 \n" + "punpckhbw %xmm7,%xmm5 \n" + "movdqa %xmm5,%xmm7 \n" + "lea 0x10(%eax,%edi,8),%eax \n" + "neg %edi \n" + "movdqa %xmm0,%xmm5 \n" + "punpcklwd %xmm2,%xmm0 \n" + "punpckhwd %xmm2,%xmm5 \n" + "movdqa %xmm5,%xmm2 \n" + "movdqa %xmm1,%xmm5 \n" + "punpcklwd %xmm3,%xmm1 \n" + "punpckhwd %xmm3,%xmm5 \n" + "movdqa %xmm5,%xmm3 \n" + "movdqa %xmm4,%xmm5 \n" + "punpcklwd %xmm6,%xmm4 \n" + "punpckhwd %xmm6,%xmm5 \n" + "movdqa %xmm5,%xmm6 \n" + "movdqu (%esp),%xmm5 \n" + "movdqu %xmm6,(%esp) \n" + "movdqa %xmm5,%xmm6 \n" + "punpcklwd %xmm7,%xmm5 \n" + "punpckhwd %xmm7,%xmm6 \n" + "movdqa %xmm6,%xmm7 \n" + "movdqa %xmm0,%xmm6 \n" + "punpckldq %xmm4,%xmm0 \n" + "punpckhdq %xmm4,%xmm6 \n" + "movdqa %xmm6,%xmm4 \n" + "movdqu (%esp),%xmm6 \n" + "movlpd %xmm0,(%edx) \n" + "movhpd %xmm0,(%ebx) \n" + "movlpd %xmm4,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm4,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm2,%xmm0 \n" + "punpckldq %xmm6,%xmm2 \n" + "movlpd %xmm2,(%edx) \n" + "movhpd %xmm2,(%ebx) \n" + "punpckhdq %xmm6,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa 
%xmm1,%xmm0 \n" + "punpckldq %xmm5,%xmm1 \n" + "movlpd %xmm1,(%edx) \n" + "movhpd %xmm1,(%ebx) \n" + "punpckhdq %xmm5,%xmm0 \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "movdqa %xmm3,%xmm0 \n" + "punpckldq %xmm7,%xmm3 \n" + "movlpd %xmm3,(%edx) \n" + "movhpd %xmm3,(%ebx) \n" + "punpckhdq %xmm7,%xmm0 \n" + "sub $0x8,%ecx \n" + "movlpd %xmm0,(%edx,%esi,1) \n" + "lea (%edx,%esi,2),%edx \n" + "movhpd %xmm0,(%ebx,%ebp,1) \n" + "lea (%ebx,%ebp,2),%ebx \n" + "jg 1b \n" + "mov 0x10(%esp),%esp \n" + "pop %ebp \n" + "pop %edi \n" + "pop %esi \n" + "pop %ebx \n" +#if defined(__native_client__) + "pop %ecx \n" + "and $0xffffffe0,%ecx \n" + "jmp *%ecx \n" +#else + "ret \n" +#endif +); +#endif +#if !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \ + defined(__x86_64__) +// 64 bit version has enough registers to do 16x8 to 8x16 at a time. +void TransposeWx8_Fast_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%3),%%xmm1 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqu (%0),%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm8,%%xmm9 \n" + "palignr $0x8,%%xmm1,%%xmm1 \n" + "palignr $0x8,%%xmm9,%%xmm9 \n" + "movdqu (%0,%3),%%xmm3 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm2,%%xmm10 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm10 \n" + "movdqa %%xmm2,%%xmm3 \n" + "movdqa %%xmm10,%%xmm11 \n" + "movdqu (%0),%%xmm4 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "movdqu (%0,%3),%%xmm5 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm4,%%xmm12 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm12 \n" + "movdqa %%xmm4,%%xmm5 \n" + "movdqa %%xmm12,%%xmm13 \n" + "movdqu (%0),%%xmm6 \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movdqu (%0,%3),%%xmm7 \n" + "lea (%0,%3,2),%0 \n" + "movdqa %%xmm6,%%xmm14 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "punpckhbw %%xmm7,%%xmm14 \n" + "neg %3 \n" + "movdqa %%xmm6,%%xmm7 \n" + "movdqa %%xmm14,%%xmm15 \n" + "lea 0x10(%0,%3,8),%0 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "neg %3 \n" + // Second round of bit swap. + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "palignr $0x8,%%xmm2,%%xmm2 \n" + "palignr $0x8,%%xmm3,%%xmm3 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm4,%%xmm6 \n" + "movdqa %%xmm5,%%xmm7 \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "punpcklwd %%xmm10,%%xmm8 \n" + "punpcklwd %%xmm11,%%xmm9 \n" + "movdqa %%xmm8,%%xmm10 \n" + "movdqa %%xmm9,%%xmm11 \n" + "palignr $0x8,%%xmm10,%%xmm10 \n" + "palignr $0x8,%%xmm11,%%xmm11 \n" + "punpcklwd %%xmm14,%%xmm12 \n" + "punpcklwd %%xmm15,%%xmm13 \n" + "movdqa %%xmm12,%%xmm14 \n" + "movdqa %%xmm13,%%xmm15 \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + // Third round of bit swap. + // Write to the destination pointer. 
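+ // 16 source columns were read per pass (the high eight bytes of
+ // each load end up in xmm8-xmm15), so the stores below emit 16
+ // transposed rows of 8 bytes before the width counter drops by 16.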
+ "punpckldq %%xmm4,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "movdqa %%xmm0,%%xmm4 \n" + "palignr $0x8,%%xmm4,%%xmm4 \n" + "movq %%xmm4,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movdqa %%xmm2,%%xmm6 \n" + "movq %%xmm2,(%1) \n" + "palignr $0x8,%%xmm6,%%xmm6 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movq %%xmm6,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm1,%%xmm5 \n" + "movq %%xmm1,(%1) \n" + "palignr $0x8,%%xmm5,%%xmm5 \n" + "movq %%xmm5,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movq %%xmm3,(%1) \n" + "movdqa %%xmm3,%%xmm7 \n" + "palignr $0x8,%%xmm7,%%xmm7 \n" + "movq %%xmm7,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm12,%%xmm8 \n" + "movq %%xmm8,(%1) \n" + "movdqa %%xmm8,%%xmm12 \n" + "palignr $0x8,%%xmm12,%%xmm12 \n" + "movq %%xmm12,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm14,%%xmm10 \n" + "movdqa %%xmm10,%%xmm14 \n" + "movq %%xmm10,(%1) \n" + "palignr $0x8,%%xmm14,%%xmm14 \n" + "punpckldq %%xmm13,%%xmm9 \n" + "movq %%xmm14,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "movdqa %%xmm9,%%xmm13 \n" + "movq %%xmm9,(%1) \n" + "palignr $0x8,%%xmm13,%%xmm13 \n" + "movq %%xmm13,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "punpckldq %%xmm15,%%xmm11 \n" + "movq %%xmm11,(%1) \n" + "movdqa %%xmm11,%%xmm15 \n" + "palignr $0x8,%%xmm15,%%xmm15 \n" + "sub $0x10,%2 \n" + "movq %%xmm15,(%1,%4) \n" + "lea (%1,%4,2),%1 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)) // %4 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15" +); +} + +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, int width) { + asm volatile ( + // Read in the data from the source pointer. + // First round of bit swap. + ".p2align 2 \n" +"1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu (%0,%4),%%xmm1 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm0,%%xmm8 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm8 \n" + "movdqa %%xmm8,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "movdqu (%0,%4),%%xmm3 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "punpckhbw %%xmm3,%%xmm8 \n" + "movdqa %%xmm8,%%xmm3 \n" + "movdqu (%0),%%xmm4 \n" + "movdqu (%0,%4),%%xmm5 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm4,%%xmm8 \n" + "punpcklbw %%xmm5,%%xmm4 \n" + "punpckhbw %%xmm5,%%xmm8 \n" + "movdqa %%xmm8,%%xmm5 \n" + "movdqu (%0),%%xmm6 \n" + "movdqu (%0,%4),%%xmm7 \n" + "lea (%0,%4,2),%0 \n" + "movdqa %%xmm6,%%xmm8 \n" + "punpcklbw %%xmm7,%%xmm6 \n" + "neg %4 \n" + "lea 0x10(%0,%4,8),%0 \n" + "punpckhbw %%xmm7,%%xmm8 \n" + "movdqa %%xmm8,%%xmm7 \n" + "neg %4 \n" + // Second round of bit swap. + "movdqa %%xmm0,%%xmm8 \n" + "movdqa %%xmm1,%%xmm9 \n" + "punpckhwd %%xmm2,%%xmm8 \n" + "punpckhwd %%xmm3,%%xmm9 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpcklwd %%xmm3,%%xmm1 \n" + "movdqa %%xmm8,%%xmm2 \n" + "movdqa %%xmm9,%%xmm3 \n" + "movdqa %%xmm4,%%xmm8 \n" + "movdqa %%xmm5,%%xmm9 \n" + "punpckhwd %%xmm6,%%xmm8 \n" + "punpckhwd %%xmm7,%%xmm9 \n" + "punpcklwd %%xmm6,%%xmm4 \n" + "punpcklwd %%xmm7,%%xmm5 \n" + "movdqa %%xmm8,%%xmm6 \n" + "movdqa %%xmm9,%%xmm7 \n" + // Third round of bit swap. + // Write to the destination pointer. 
+ "movdqa %%xmm0,%%xmm8 \n" + "punpckldq %%xmm4,%%xmm0 \n" + "movlpd %%xmm0,(%1) \n" // Write back U channel + "movhpd %%xmm0,(%2) \n" // Write back V channel + "punpckhdq %%xmm4,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm2,%%xmm8 \n" + "punpckldq %%xmm6,%%xmm2 \n" + "movlpd %%xmm2,(%1) \n" + "movhpd %%xmm2,(%2) \n" + "punpckhdq %%xmm6,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm1,%%xmm8 \n" + "punpckldq %%xmm5,%%xmm1 \n" + "movlpd %%xmm1,(%1) \n" + "movhpd %%xmm1,(%2) \n" + "punpckhdq %%xmm5,%%xmm8 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "movdqa %%xmm3,%%xmm8 \n" + "punpckldq %%xmm7,%%xmm3 \n" + "movlpd %%xmm3,(%1) \n" + "movhpd %%xmm3,(%2) \n" + "punpckhdq %%xmm7,%%xmm8 \n" + "sub $0x8,%3 \n" + "movlpd %%xmm8,(%1,%5) \n" + "lea (%1,%5,2),%1 \n" + "movhpd %%xmm8,(%2,%6) \n" + "lea (%2,%6,2),%2 \n" + "jg 1b \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : "r"((intptr_t)(src_stride)), // %4 + "r"((intptr_t)(dst_stride_a)), // %5 + "r"((intptr_t)(dst_stride_b)) // %6 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9" +); +} +#endif +#endif + +#endif // defined(__x86_64__) || defined(__i386__) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/third_party/libyuv/source/rotate_mips.cc b/third_party/libyuv/source/rotate_mips.cc index 70770fd..efe6bd9 100644 --- a/third_party/libyuv/source/rotate_mips.cc +++ b/third_party/libyuv/source/rotate_mips.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/rotate_row.h" #include "libyuv/basic_types.h" @@ -22,8 +23,7 @@ extern "C" { (_MIPS_SIM == _MIPS_SIM_ABI32) void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width) { + uint8* dst, int dst_stride, int width) { __asm__ __volatile__ ( ".set push \n" ".set noreorder \n" @@ -106,9 +106,8 @@ void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, ); } -void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width) { +void TransposeWx8_Fast_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { __asm__ __volatile__ ( ".set noat \n" ".set push \n" diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc index a23a40f..76043b3 100644 --- a/third_party/libyuv/source/rotate_neon.cc +++ b/third_party/libyuv/source/rotate_neon.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/rotate_row.h" #include "libyuv/basic_types.h" diff --git a/third_party/libyuv/source/rotate_neon64.cc b/third_party/libyuv/source/rotate_neon64.cc index 92358af..f52c082 100644 --- a/third_party/libyuv/source/rotate_neon64.cc +++ b/third_party/libyuv/source/rotate_neon64.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/rotate_row.h" #include "libyuv/basic_types.h" @@ -21,11 +22,10 @@ extern "C" { #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) static uvec8 kVTbl4x4Transpose = - { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; + { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; void TransposeWx8_NEON(const uint8* src, int src_stride, - uint8* dst, int dst_stride, - int width) { + uint8* dst, int dst_stride, int width) { const uint8* 
src_temp = NULL; int64 width64 = (int64) width; // Work around clang 3.4 warning. asm volatile ( diff --git a/third_party/libyuv/source/rotate_win.cc b/third_party/libyuv/source/rotate_win.cc new file mode 100644 index 0000000..2760066 --- /dev/null +++ b/third_party/libyuv/source/rotate_win.cc @@ -0,0 +1,248 @@ +/* + * Copyright 2013 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/row.h" +#include "libyuv/rotate_row.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// This module is for Visual C x86. +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) + +__declspec(naked) +void TransposeWx8_SSSE3(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width) { + __asm { + push edi + push esi + push ebp + mov eax, [esp + 12 + 4] // src + mov edi, [esp + 12 + 8] // src_stride + mov edx, [esp + 12 + 12] // dst + mov esi, [esp + 12 + 16] // dst_stride + mov ecx, [esp + 12 + 20] // width + + // Read in the data from the source pointer. + // First round of bit swap. + align 4 + convertloop: + movq xmm0, qword ptr [eax] + lea ebp, [eax + 8] + movq xmm1, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm0, xmm1 + movq xmm2, qword ptr [eax] + movdqa xmm1, xmm0 + palignr xmm1, xmm1, 8 + movq xmm3, qword ptr [eax + edi] + lea eax, [eax + 2 * edi] + punpcklbw xmm2, xmm3 + movdqa xmm3, xmm2 + movq xmm4, qword ptr [eax] + palignr xmm3, xmm3, 8 + movq xmm5, qword ptr [eax + edi] + punpcklbw xmm4, xmm5 + lea eax, [eax + 2 * edi] + movdqa xmm5, xmm4 + movq xmm6, qword ptr [eax] + palignr xmm5, xmm5, 8 + movq xmm7, qword ptr [eax + edi] + punpcklbw xmm6, xmm7 + mov eax, ebp + movdqa xmm7, xmm6 + palignr xmm7, xmm7, 8 + // Second round of bit swap. + punpcklwd xmm0, xmm2 + punpcklwd xmm1, xmm3 + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + palignr xmm2, xmm2, 8 + palignr xmm3, xmm3, 8 + punpcklwd xmm4, xmm6 + punpcklwd xmm5, xmm7 + movdqa xmm6, xmm4 + movdqa xmm7, xmm5 + palignr xmm6, xmm6, 8 + palignr xmm7, xmm7, 8 + // Third round of bit swap. + // Write to the destination pointer. 
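+ // Same store pattern as the GCC TransposeWx8_SSSE3: write the low
+ // qword, palignr the high qword down, and write it one dst_stride
+ // further on.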
+ punpckldq xmm0, xmm4 + movq qword ptr [edx], xmm0 + movdqa xmm4, xmm0 + palignr xmm4, xmm4, 8 + movq qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + punpckldq xmm2, xmm6 + movdqa xmm6, xmm2 + palignr xmm6, xmm6, 8 + movq qword ptr [edx], xmm2 + punpckldq xmm1, xmm5 + movq qword ptr [edx + esi], xmm6 + lea edx, [edx + 2 * esi] + movdqa xmm5, xmm1 + movq qword ptr [edx], xmm1 + palignr xmm5, xmm5, 8 + punpckldq xmm3, xmm7 + movq qword ptr [edx + esi], xmm5 + lea edx, [edx + 2 * esi] + movq qword ptr [edx], xmm3 + movdqa xmm7, xmm3 + palignr xmm7, xmm7, 8 + sub ecx, 8 + movq qword ptr [edx + esi], xmm7 + lea edx, [edx + 2 * esi] + jg convertloop + + pop ebp + pop esi + pop edi + ret + } +} + +__declspec(naked) +void TransposeUVWx8_SSE2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int w) { + __asm { + push ebx + push esi + push edi + push ebp + mov eax, [esp + 16 + 4] // src + mov edi, [esp + 16 + 8] // src_stride + mov edx, [esp + 16 + 12] // dst_a + mov esi, [esp + 16 + 16] // dst_stride_a + mov ebx, [esp + 16 + 20] // dst_b + mov ebp, [esp + 16 + 24] // dst_stride_b + mov ecx, esp + sub esp, 4 + 16 + and esp, ~15 + mov [esp + 16], ecx + mov ecx, [ecx + 16 + 28] // w + + align 4 + convertloop: + // Read in the data from the source pointer. + // First round of bit swap. + movdqu xmm0, [eax] + movdqu xmm1, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm0 // use xmm7 as temp register. + punpcklbw xmm0, xmm1 + punpckhbw xmm7, xmm1 + movdqa xmm1, xmm7 + movdqu xmm2, [eax] + movdqu xmm3, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm2 + punpcklbw xmm2, xmm3 + punpckhbw xmm7, xmm3 + movdqa xmm3, xmm7 + movdqu xmm4, [eax] + movdqu xmm5, [eax + edi] + lea eax, [eax + 2 * edi] + movdqa xmm7, xmm4 + punpcklbw xmm4, xmm5 + punpckhbw xmm7, xmm5 + movdqa xmm5, xmm7 + movdqu xmm6, [eax] + movdqu xmm7, [eax + edi] + lea eax, [eax + 2 * edi] + movdqu [esp], xmm5 // backup xmm5 + neg edi + movdqa xmm5, xmm6 // use xmm5 as temp register. + punpcklbw xmm6, xmm7 + punpckhbw xmm5, xmm7 + movdqa xmm7, xmm5 + lea eax, [eax + 8 * edi + 16] + neg edi + // Second round of bit swap. + movdqa xmm5, xmm0 + punpcklwd xmm0, xmm2 + punpckhwd xmm5, xmm2 + movdqa xmm2, xmm5 + movdqa xmm5, xmm1 + punpcklwd xmm1, xmm3 + punpckhwd xmm5, xmm3 + movdqa xmm3, xmm5 + movdqa xmm5, xmm4 + punpcklwd xmm4, xmm6 + punpckhwd xmm5, xmm6 + movdqa xmm6, xmm5 + movdqu xmm5, [esp] // restore xmm5 + movdqu [esp], xmm6 // backup xmm6 + movdqa xmm6, xmm5 // use xmm6 as temp register. + punpcklwd xmm5, xmm7 + punpckhwd xmm6, xmm7 + movdqa xmm7, xmm6 + // Third round of bit swap. + // Write to the destination pointer. + movdqa xmm6, xmm0 + punpckldq xmm0, xmm4 + punpckhdq xmm6, xmm4 + movdqa xmm4, xmm6 + movdqu xmm6, [esp] // restore xmm6 + movlpd qword ptr [edx], xmm0 + movhpd qword ptr [ebx], xmm0 + movlpd qword ptr [edx + esi], xmm4 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm4 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm2 // use xmm0 as the temp register. + punpckldq xmm2, xmm6 + movlpd qword ptr [edx], xmm2 + movhpd qword ptr [ebx], xmm2 + punpckhdq xmm0, xmm6 + movlpd qword ptr [edx + esi], xmm0 + lea edx, [edx + 2 * esi] + movhpd qword ptr [ebx + ebp], xmm0 + lea ebx, [ebx + 2 * ebp] + movdqa xmm0, xmm1 // use xmm0 as the temp register. 
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
index 19340b3..1cb1f6b 100644
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -10,6 +10,8 @@
 #include "libyuv/row.h"
+#include <string.h>  // For memset.
+
 #include "libyuv/basic_types.h"
 #ifdef __cplusplus
@@ -17,699 +19,660 @@ namespace libyuv {
 extern "C" {
 #endif
-// YUV to RGB does multiple of 8 with SIMD and remainder with C.
-#define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \
+// Subsampled source needs to be increased by 1 if not even.
+#define SS(width, shift) (((width) + (1 << (shift)) - 1) >> (shift))
+
+// Any 3 planes to 1.
+#define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \
 void NAMEANY(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, \
- uint8* rgb_buf, int width) { \
+ uint8* dst_ptr, int width) { \
+ SIMD_ALIGNED(uint8 temp[64 * 4]); \
+ memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
+ int r = width & MASK; \
 int n = width & ~MASK; \
 if (n > 0) { \
- I420TORGB_SIMD(y_buf, u_buf, v_buf, rgb_buf, n); \
+ ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, n); \
 } \
- I420TORGB_C(y_buf + n, \
- u_buf + (n >> UV_SHIFT), \
- v_buf + (n >> UV_SHIFT), \
- rgb_buf + n * BPP, width & MASK); \
+ memcpy(temp, y_buf + n, r); \
+ memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
+ ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, MASK + 1); \
+ memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
+ SS(r, DUVSHIFT) * BPP); \
 }
 #ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, I422ToARGBRow_C,
- 1, 4, 7)
+ANY31(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
 #endif
 #ifdef HAS_I444TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, I444ToARGBRow_C,
- 0, 4, 7)
-YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, I411ToARGBRow_C,
- 2, 4, 7)
-YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, I422ToBGRARow_C,
- 1, 4, 7)
-YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, I422ToABGRRow_C,
- 1, 4, 7)
-YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, I422ToRGBARow_C,
- 1, 4, 7)
-YANY(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, I422ToARGB4444Row_C,
- 1, 2, 7)
-YANY(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, I422ToARGB1555Row_C,
- 1, 2, 7)
-YANY(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, I422ToRGB565Row_C,
- 1, 2, 7)
-YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
-YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
-YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
+ANY31(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) +ANY31(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_SSSE3, 2, 0, 4, 7) +ANY31(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_SSSE3, 1, 0, 4, 7) +ANY31(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_SSSE3, 1, 0, 4, 7) +ANY31(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) +ANY31(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7) +ANY31(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7) +ANY31(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7) +ANY31(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7) +ANY31(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, 1, 0, 3, 7) +ANY31(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, 1, 1, 4, 15) +ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15) #endif // HAS_I444TOARGBROW_SSSE3 +#ifdef HAS_I422TORGB24ROW_AVX2 +ANY31(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15) +#endif +#ifdef HAS_I422TORAWROW_AVX2 +ANY31(I422ToRAWRow_Any_AVX2, I422ToRAWRow_AVX2, 1, 0, 3, 15) +#endif #ifdef HAS_J422TOARGBROW_SSSE3 -YANY(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, J422ToARGBRow_C, - 1, 4, 7) +ANY31(J422ToARGBRow_Any_SSSE3, J422ToARGBRow_SSSE3, 1, 0, 4, 7) +#endif +#ifdef HAS_J422TOARGBROW_AVX2 +ANY31(J422ToARGBRow_Any_AVX2, J422ToARGBRow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I422TOARGBROW_AVX2 -YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15) +ANY31(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I422TOBGRAROW_AVX2 -YANY(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, I422ToBGRARow_C, 1, 4, 15) +ANY31(I422ToBGRARow_Any_AVX2, I422ToBGRARow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I422TORGBAROW_AVX2 -YANY(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, I422ToRGBARow_C, 1, 4, 15) +ANY31(I422ToRGBARow_Any_AVX2, I422ToRGBARow_AVX2, 1, 0, 4, 15) #endif #ifdef HAS_I422TOABGRROW_AVX2 -YANY(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, I422ToABGRRow_C, 1, 4, 15) +ANY31(I422ToABGRRow_Any_AVX2, I422ToABGRRow_AVX2, 1, 0, 4, 15) +#endif +#ifdef HAS_I444TOARGBROW_AVX2 +ANY31(I444ToARGBRow_Any_AVX2, I444ToARGBRow_AVX2, 0, 0, 4, 15) +#endif +#ifdef HAS_I411TOARGBROW_AVX2 +ANY31(I411ToARGBRow_Any_AVX2, I411ToARGBRow_AVX2, 2, 0, 4, 15) #endif #ifdef HAS_I422TOARGB4444ROW_AVX2 -YANY(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, I422ToARGB4444Row_C, - 1, 2, 7) +ANY31(I422ToARGB4444Row_Any_AVX2, I422ToARGB4444Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TOARGB1555ROW_AVX2 -YANY(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, I422ToARGB1555Row_C, - 1, 2, 7) +ANY31(I422ToARGB1555Row_Any_AVX2, I422ToARGB1555Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TORGB565ROW_AVX2 -YANY(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, I422ToRGB565Row_C, - 1, 2, 7) +ANY31(I422ToRGB565Row_Any_AVX2, I422ToRGB565Row_AVX2, 1, 0, 2, 7) #endif #ifdef HAS_I422TOARGBROW_NEON -YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) -YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) -YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7) -YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7) -YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7) -YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7) -YANY(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, I422ToRGB24Row_C, 1, 3, 7) -YANY(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, I422ToRAWRow_C, 1, 3, 7) -YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C, - 
1, 2, 7) -YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C, - 1, 2, 7) -YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7) +ANY31(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, 0, 0, 4, 7) +ANY31(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, 1, 0, 4, 7) +ANY31(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, 2, 0, 4, 7) +ANY31(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, 1, 0, 4, 7) +ANY31(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, 1, 0, 4, 7) +ANY31(I422ToRGB24Row_Any_NEON, I422ToRGB24Row_NEON, 1, 0, 3, 7) +ANY31(I422ToRAWRow_Any_NEON, I422ToRAWRow_NEON, 1, 0, 3, 7) +ANY31(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, 1, 0, 2, 7) +ANY31(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, 1, 0, 2, 7) +ANY31(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, 1, 0, 2, 7) #endif #ifdef HAS_I422TOYUY2ROW_NEON -YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15) +ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15) #endif #ifdef HAS_I422TOUYVYROW_NEON -YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) +ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15) #endif -#undef YANY +#undef ANY31 -// Wrappers to handle odd width -#define NV2NY(NAMEANY, NV12TORGB_SIMD, NV12TORGB_C, UV_SHIFT, BPP, MASK) \ +// Any 2 planes to 1. +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, \ - uint8* rgb_buf, int width) { \ + uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - NV12TORGB_SIMD(y_buf, uv_buf, rgb_buf, n); \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ } \ - NV12TORGB_C(y_buf + n, \ - uv_buf + (n >> UV_SHIFT), \ - rgb_buf + n * BPP, width & MASK); \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } +// Biplanar to RGB. 
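As an illustration of how these wrappers work, here is ANY21 hand-expanded for the MergeUVRow_Any_SSE2 instantiation below (UVSHIFT = 0, SBPP = 1, SBPP2 = 1, BPP = 2, MASK = 15, so the shifts and byte scales simplify away; the y_buf/uv_buf parameter names come from the macro, but for MergeUV they carry the U and V source rows). The SIMD row function handles the largest multiple of 16 pixels, the ragged tail is copied into aligned scratch, one more full SIMD block runs on the scratch, and only the valid bytes are copied out; unlike the old YANY/NV2NY wrappers, no per-pixel C fallback is needed. ANY31 and ANY11 follow the same pattern with three and one source pointers.

  // Illustrative hand expansion of
  // ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15);
  // the real function is generated by the macro above.
  void MergeUVRow_Any_SSE2(const uint8* y_buf, const uint8* uv_buf,
                           uint8* dst_ptr, int width) {
    SIMD_ALIGNED(uint8 temp[64 * 3]);
    memset(temp, 0, 64 * 2);  // for msan
    int r = width & 15;       // remainder pixels (MASK = 15)
    int n = width & ~15;      // largest multiple of 16
    if (n > 0) {
      MergeUVRow_SSE2(y_buf, uv_buf, dst_ptr, n);  // bulk of the row
    }
    memcpy(temp, y_buf + n, r);        // tail of the U row (SBPP = 1)
    memcpy(temp + 64, uv_buf + n, r);  // tail of the V row (SBPP2 = 1)
    MergeUVRow_SSE2(temp, temp + 64, temp + 128, 16);  // one full block
    memcpy(dst_ptr + n * 2, temp + 128, r * 2);  // valid bytes (BPP = 2)
  }

The 64-byte scratch regions are sized so a full block of MASK + 1 pixels always fits, and the memset keeps the SIMD kernel's reads past the copied tail defined for msan.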
#ifdef HAS_NV12TOARGBROW_SSSE3 -NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, NV12ToARGBRow_C, 0, 4, 7) -NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, NV21ToARGBRow_C, 0, 4, 7) +ANY21(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_SSSE3, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_SSSE3, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TOARGBROW_AVX2 -NV2NY(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, NV12ToARGBRow_C, 0, 4, 15) -NV2NY(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, NV21ToARGBRow_C, 0, 4, 15) +ANY21(NV12ToARGBRow_Any_AVX2, NV12ToARGBRow_AVX2, 1, 1, 2, 4, 15) +ANY21(NV21ToARGBRow_Any_AVX2, NV21ToARGBRow_AVX2, 1, 1, 2, 4, 15) #endif #ifdef HAS_NV12TOARGBROW_NEON -NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4, 7) -NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4, 7) +ANY21(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, 1, 1, 2, 4, 7) +ANY21(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, 1, 1, 2, 4, 7) #endif #ifdef HAS_NV12TORGB565ROW_SSSE3 -NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, - 0, 2, 7) -NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, - 0, 2, 7) +ANY21(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, 1, 1, 2, 2, 7) #endif #ifdef HAS_NV12TORGB565ROW_AVX2 -NV2NY(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, NV12ToRGB565Row_C, - 0, 2, 15) -NV2NY(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, NV21ToRGB565Row_C, - 0, 2, 15) +ANY21(NV12ToRGB565Row_Any_AVX2, NV12ToRGB565Row_AVX2, 1, 1, 2, 2, 15) +ANY21(NV21ToRGB565Row_Any_AVX2, NV21ToRGB565Row_AVX2, 1, 1, 2, 2, 15) #endif #ifdef HAS_NV12TORGB565ROW_NEON -NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, - 0, 2, 7) -NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, - 0, 2, 7) +ANY21(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, 1, 1, 2, 2, 7) +ANY21(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, 1, 1, 2, 2, 7) +#endif + +// Merge functions. +#ifdef HAS_MERGEUVROW_SSE2 +ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15) +#endif +#ifdef HAS_MERGEUVROW_AVX2 +ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31) +#endif +#ifdef HAS_MERGEUVROW_NEON +ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15) #endif -#undef NVANY -#define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src, uint8* dst, int width) { \ +// Math functions. 
+#ifdef HAS_ARGBMULTIPLYROW_SSE2 +ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBADDROW_SSE2 +ANY21(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBSUBTRACTROW_SSE2 +ANY21(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, 0, 4, 4, 4, 3) +#endif +#ifdef HAS_ARGBMULTIPLYROW_AVX2 +ANY21(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_AVX2 +ANY21(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_AVX2 +ANY21(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBMULTIPLYROW_NEON +ANY21(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBADDROW_NEON +ANY21(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_ARGBSUBTRACTROW_NEON +ANY21(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, 0, 4, 4, 4, 7) +#endif +#ifdef HAS_SOBELROW_SSE2 +ANY21(SobelRow_Any_SSE2, SobelRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELROW_NEON +ANY21(SobelRow_Any_NEON, SobelRow_NEON, 0, 1, 1, 4, 7) +#endif +#ifdef HAS_SOBELTOPLANEROW_SSE2 +ANY21(SobelToPlaneRow_Any_SSE2, SobelToPlaneRow_SSE2, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELTOPLANEROW_NEON +ANY21(SobelToPlaneRow_Any_NEON, SobelToPlaneRow_NEON, 0, 1, 1, 1, 15) +#endif +#ifdef HAS_SOBELXYROW_SSE2 +ANY21(SobelXYRow_Any_SSE2, SobelXYRow_SSE2, 0, 1, 1, 4, 15) +#endif +#ifdef HAS_SOBELXYROW_NEON +ANY21(SobelXYRow_Any_NEON, SobelXYRow_NEON, 0, 1, 1, 4, 7) +#endif +#undef ANY21 + +// Any 1 to 1. +#define ANY11(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 2]); \ + memset(temp, 0, 128); /* for YUY2 and msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ARGBTORGB_SIMD(src, dst, n); \ + ANY_SIMD(src_ptr, dst_ptr, n); \ } \ - ARGBTORGB_C(src + n * SBPP, dst + n * BPP, width & MASK); \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * SBPP, SS(r, UVSHIFT) * SBPP); \ + ANY_SIMD(temp, temp + 128, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } +#ifdef HAS_COPYROW_AVX +ANY11(CopyRow_Any_AVX, CopyRow_AVX, 0, 1, 1, 63) +#endif +#ifdef HAS_COPYROW_SSE2 +ANY11(CopyRow_Any_SSE2, CopyRow_SSE2, 0, 1, 1, 31) +#endif +#ifdef HAS_COPYROW_NEON +ANY11(CopyRow_Any_NEON, CopyRow_NEON, 0, 1, 1, 31) +#endif #if defined(HAS_ARGBTORGB24ROW_SSSE3) -RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, ARGBToRGB24Row_C, - 4, 3, 15) -RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, ARGBToRAWRow_C, - 4, 3, 15) -RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, ARGBToRGB565Row_C, - 4, 2, 3) -RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, - 4, 2, 3) -RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, - 4, 2, 3) +ANY11(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 0, 4, 3, 15) +ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3) +ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3) #endif #if defined(HAS_ARGBTOARGB4444ROW_AVX2) -RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C, - 4, 2, 7) -RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C, - 4, 2, 7) -RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C, - 4, 2, 
7) +ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif - -#if defined(HAS_I400TOARGBROW_SSE2) -RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C, 1, 4, 7) +#if defined(HAS_J400TOARGBROW_SSE2) +ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) +#endif +#if defined(HAS_J400TOARGBROW_AVX2) +ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_YTOARGBROW_SSE2) -RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C, 1, 4, 7) +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) #endif -#if defined(HAS_YTOARGBROW_AVX2) -RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 1, 4, 15) +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) #endif #if defined(HAS_YUY2TOARGBROW_SSSE3) -RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C, 2, 4, 15) -RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, UYVYToARGBRow_C, 2, 4, 15) -RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, - 3, 4, 15) -RGBANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, RAWToARGBRow_C, 3, 4, 15) -RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C, - 2, 4, 7) -RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, - 2, 4, 7) -RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, - 2, 4, 7) +ANY11(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_SSSE3, 1, 4, 4, 15) +ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) +ANY11(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 0, 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 0, 2, 4, 7) +#endif +#if defined(HAS_RGB565TOARGBROW_AVX2) +ANY11(RGB565ToARGBRow_Any_AVX2, RGB565ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB1555TOARGBROW_AVX2) +ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) +#endif +#if defined(HAS_ARGB4444TOARGBROW_AVX2) +ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_YUY2TOARGBROW_AVX2) -RGBANY(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, YUY2ToARGBRow_C, 2, 4, 31) -RGBANY(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, UYVYToARGBRow_C, 2, 4, 31) +ANY11(YUY2ToARGBRow_Any_AVX2, YUY2ToARGBRow_AVX2, 1, 4, 4, 31) +ANY11(UYVYToARGBRow_Any_AVX2, UYVYToARGBRow_AVX2, 1, 4, 4, 31) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) -RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 4, 3, 7) -RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, ARGBToRAWRow_C, 4, 3, 7) -RGBANY(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, ARGBToRGB565Row_C, - 4, 2, 7) -RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, - 4, 2, 7) -RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, - 4, 2, 7) -RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 1, 4, 7) -RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C, 1, 4, 7) -RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, 2, 4, 7) -RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7) -#endif -#undef 
RGBANY - -// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. -#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) { \ - int n = width & ~MASK; \ - if (n > 0) { \ - ARGBTORGB_SIMD(src, dst, selector, n); \ - } \ - ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \ - } - -#if defined(HAS_ARGBTOBAYERGGROW_SSE2) -BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C, - 4, 1, 7) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) +ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) +ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) +ANY11(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, 1, 4, 4, 7) +ANY11(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, 1, 4, 4, 7) #endif -#if defined(HAS_ARGBTOBAYERGGROW_NEON) -BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C, - 4, 1, 7) -#endif - -#undef BAYERANY - -#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ - int n = width & ~MASK; \ - if (n > 0) { \ - ARGBTOY_SIMD(src_argb, dst_y, n); \ - } \ - ARGBTOY_C(src_argb + n * SBPP, \ - dst_y + n * BPP, width & MASK); \ - } #ifdef HAS_ARGBTOYROW_AVX2 -YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31) +ANY11(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 0, 4, 1, 31) #endif #ifdef HAS_ARGBTOYJROW_AVX2 -YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31) +ANY11(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 0, 4, 1, 31) #endif #ifdef HAS_UYVYTOYROW_AVX2 -YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31) +ANY11(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 0, 2, 1, 31) #endif #ifdef HAS_YUY2TOYROW_AVX2 -YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31) +ANY11(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 1, 4, 1, 31) #endif #ifdef HAS_ARGBTOYROW_SSSE3 -YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15) +ANY11(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, 0, 4, 1, 15) #endif #ifdef HAS_BGRATOYROW_SSSE3 -YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, BGRAToYRow_C, 4, 1, 15) -YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, ABGRToYRow_C, 4, 1, 15) -YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, RGBAToYRow_C, 4, 1, 15) -YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, YUY2ToYRow_C, 2, 1, 15) -YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, UYVYToYRow_C, 2, 1, 15) +ANY11(BGRAToYRow_Any_SSSE3, BGRAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(ABGRToYRow_Any_SSSE3, ABGRToYRow_SSSE3, 0, 4, 1, 15) +ANY11(RGBAToYRow_Any_SSSE3, RGBAToYRow_SSSE3, 0, 4, 1, 15) +ANY11(YUY2ToYRow_Any_SSE2, YUY2ToYRow_SSE2, 1, 4, 1, 15) +ANY11(UYVYToYRow_Any_SSE2, UYVYToYRow_SSE2, 1, 4, 1, 15) #endif #ifdef HAS_ARGBTOYJROW_SSSE3 -YANY(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, ARGBToYJRow_C, 4, 1, 15) +ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15) #endif #ifdef HAS_ARGBTOYROW_NEON -YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, ARGBToYRow_C, 4, 1, 7) +ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ARGBTOYJROW_NEON -YANY(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, ARGBToYJRow_C, 4, 1, 7) +ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 
0, 4, 1, 7) #endif #ifdef HAS_BGRATOYROW_NEON -YANY(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, BGRAToYRow_C, 4, 1, 7) +ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_ABGRTOYROW_NEON -YANY(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, ABGRToYRow_C, 4, 1, 7) +ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_RGBATOYROW_NEON -YANY(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, RGBAToYRow_C, 4, 1, 7) +ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7) #endif #ifdef HAS_RGB24TOYROW_NEON -YANY(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, RGB24ToYRow_C, 3, 1, 7) +ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RAWTOYROW_NEON -YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, RAWToYRow_C, 3, 1, 7) +ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7) #endif #ifdef HAS_RGB565TOYROW_NEON -YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, RGB565ToYRow_C, 2, 1, 7) +ANY11(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_ARGB1555TOYROW_NEON -YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, ARGB1555ToYRow_C, 2, 1, 7) +ANY11(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_ARGB4444TOYROW_NEON -YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, ARGB4444ToYRow_C, 2, 1, 7) +ANY11(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 0, 2, 1, 7) #endif #ifdef HAS_YUY2TOYROW_NEON -YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, YUY2ToYRow_C, 2, 1, 15) +ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15) #endif #ifdef HAS_UYVYTOYROW_NEON -YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, UYVYToYRow_C, 2, 1, 15) +ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15) #endif #ifdef HAS_RGB24TOARGBROW_NEON -YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, RGB24ToARGBRow_C, 3, 4, 7) +ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RAWTOARGBROW_NEON -YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, RAWToARGBRow_C, 3, 4, 7) +ANY11(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 0, 3, 4, 7) #endif #ifdef HAS_RGB565TOARGBROW_NEON -YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, RGB565ToARGBRow_C, 2, 4, 7) +ANY11(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGB1555TOARGBROW_NEON -YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, ARGB1555ToARGBRow_C, - 2, 4, 7) +ANY11(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGB4444TOARGBROW_NEON -YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, ARGB4444ToARGBRow_C, - 2, 4, 7) +ANY11(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 0, 2, 4, 7) #endif #ifdef HAS_ARGBATTENUATEROW_SSSE3 -YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, - 4, 4, 3) +ANY11(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, 0, 4, 4, 3) #endif #ifdef HAS_ARGBATTENUATEROW_SSE2 -YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C, - 4, 4, 3) +ANY11(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, 0, 4, 4, 3) #endif #ifdef HAS_ARGBUNATTENUATEROW_SSE2 -YANY(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, ARGBUnattenuateRow_C, - 4, 4, 3) +ANY11(ARGBUnattenuateRow_Any_SSE2, ARGBUnattenuateRow_SSE2, 0, 4, 4, 3) #endif #ifdef HAS_ARGBATTENUATEROW_AVX2 -YANY(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, ARGBAttenuateRow_C, - 4, 4, 7) +ANY11(ARGBAttenuateRow_Any_AVX2, ARGBAttenuateRow_AVX2, 0, 4, 4, 7) #endif #ifdef HAS_ARGBUNATTENUATEROW_AVX2 -YANY(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, ARGBUnattenuateRow_C, - 4, 4, 7) 
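The YANY/RGBANY-to-ANY11 conversions throughout this file all share one new tail-handling scheme: instead of pairing each SIMD kernel with a matching _C fallback for the leftover pixels, the wrapper now copies the tail into a padded, SIMD-aligned scratch buffer, runs the same SIMD kernel once more on a full block, and copies only the valid bytes back out. A minimal sketch of the pattern, assuming a hypothetical 16-pixel kernel Kernel16 with 1 source byte and 4 destination bytes per pixel (the 64-byte halves mirror the ANY11P definition later in this file; the UVSHIFT subsampling parameter of the full ANY11 is ignored here):

void GrayToARGBRow_Any_Sketch(const uint8* src_ptr, uint8* dst_ptr, int width) {
  SIMD_ALIGNED(uint8 temp[64 * 2]);  // first half input, second half output
  memset(temp, 0, 64);               // zero the pad; also keeps msan quiet
  int r = width & 15;                // leftover pixels (MASK = 15)
  int n = width & ~15;               // pixels covered by full SIMD blocks
  if (n > 0) {
    Kernel16(src_ptr, dst_ptr, n);   // bulk of the row
  }
  memcpy(temp, src_ptr + n * 1, r * 1);        // SBPP = 1: copy the tail in
  Kernel16(temp, temp + 64, 16);               // one full block on the pad
  memcpy(dst_ptr + n * 4, temp + 64, r * 4);   // BPP = 4: copy r pixels out
}

This is why the _C symbols disappear from every instantiation below: the wrappers no longer reference the C row functions at all.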
+ANY11(ARGBUnattenuateRow_Any_AVX2, ARGBUnattenuateRow_AVX2, 0, 4, 4, 7) #endif #ifdef HAS_ARGBATTENUATEROW_NEON -YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, - 4, 4, 7) +ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) #endif -#undef YANY +#undef ANY11 -// RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. -#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ - void NAMEANY(const uint8* src_argb, int src_stride_argb, \ - uint8* dst_u, uint8* dst_v, int width) { \ +// Any 1 to 1 with parameter. +#define ANY11P(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \ + T shuffler, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr, dst_ptr, shuffler, n); \ } \ - ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \ - dst_u + (n >> 1), \ - dst_v + (n >> 1), \ - width & MASK); \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp, temp + 64, shuffler, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } -#ifdef HAS_ARGBTOUVROW_AVX2 -UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, ARGBToUVRow_C, 4, 31) -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 -UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, ARGBToUVRow_C, 4, 15) -UVANY(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, ARGBToUVJRow_C, 4, 15) -UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, BGRAToUVRow_C, 4, 15) -UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, ABGRToUVRow_C, 4, 15) -UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, RGBAToUVRow_C, 4, 15) -#endif -#ifdef HAS_YUY2TOUVROW_AVX2 -UVANY(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, YUY2ToUVRow_C, 2, 31) -UVANY(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, UYVYToUVRow_C, 2, 31) -#endif -#ifdef HAS_YUY2TOUVROW_SSE2 -UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, YUY2ToUVRow_C, 2, 15) -UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, UYVYToUVRow_C, 2, 15) -#endif -#ifdef HAS_ARGBTOUVROW_NEON -UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15) -#endif -#ifdef HAS_ARGBTOUVJROW_NEON -UVANY(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, ARGBToUVJRow_C, 4, 15) -#endif -#ifdef HAS_BGRATOUVROW_NEON -UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15) -#endif -#ifdef HAS_ABGRTOUVROW_NEON -UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15) -#endif -#ifdef HAS_RGBATOUVROW_NEON -UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15) -#endif -#ifdef HAS_RGB24TOUVROW_NEON -UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15) +#if defined(HAS_ARGBTORGB565DITHERROW_SSE2) +ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, + const uint32, 4, 2, 3) #endif -#ifdef HAS_RAWTOUVROW_NEON -UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15) +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +ANY11P(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, + const uint32, 4, 2, 7) #endif -#ifdef HAS_RGB565TOUVROW_NEON -UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15) +#if defined(HAS_ARGBTORGB565DITHERROW_NEON) +ANY11P(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON, + const uint32, 4, 2, 7) #endif -#ifdef HAS_ARGB1555TOUVROW_NEON -UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15) +#ifdef HAS_ARGBSHUFFLEROW_SSE2 +ANY11P(ARGBShuffleRow_Any_SSE2, 
ARGBShuffleRow_SSE2, const uint8*, 4, 4, 3) #endif -#ifdef HAS_ARGB4444TOUVROW_NEON -UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15) +#ifdef HAS_ARGBSHUFFLEROW_SSSE3 +ANY11P(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, const uint8*, 4, 4, 7) #endif -#ifdef HAS_YUY2TOUVROW_NEON -UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15) +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +ANY11P(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, const uint8*, 4, 4, 15) #endif -#ifdef HAS_UYVYTOUVROW_NEON -UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15) +#ifdef HAS_ARGBSHUFFLEROW_NEON +ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3) #endif -#undef UVANY +#undef ANY11P -#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, SHIFT, MASK) \ - void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { \ +// Any 1 to 1 interpolate. Takes 2 rows of source via stride. +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + ptrdiff_t src_stride_ptr, int width, \ + int source_y_fraction) { \ + SIMD_ALIGNED(uint8 temp[64 * 3]); \ + memset(temp, 0, 64 * 2); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANY_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ } \ - ANYTOUV_C(src_uv + n * BPP, \ - dst_u + (n >> SHIFT), \ - dst_v + (n >> SHIFT), \ - width & MASK); \ + memcpy(temp, src_ptr + n * SBPP, r * SBPP); \ + memcpy(temp + 64, src_ptr + src_stride_ptr + n * SBPP, r * SBPP); \ + ANY_SIMD(temp + 128, temp, 64, MASK + 1, source_y_fraction); \ + memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ } -#ifdef HAS_ARGBTOUV444ROW_SSSE3 -UV422ANY(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, - ARGBToUV444Row_C, 4, 0, 15) +#ifdef HAS_INTERPOLATEROW_AVX2 +ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #endif -#ifdef HAS_YUY2TOUV422ROW_AVX2 -UV422ANY(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, - YUY2ToUV422Row_C, 2, 1, 31) -UV422ANY(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, - UYVYToUV422Row_C, 2, 1, 31) +#ifdef HAS_INTERPOLATEROW_SSSE3 +ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif -#ifdef HAS_ARGBTOUV422ROW_SSSE3 -UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, - ARGBToUV422Row_C, 4, 1, 15) +#ifdef HAS_INTERPOLATEROW_SSE2 +ANY11T(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, 1, 1, 15) #endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 -UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, - YUY2ToUV422Row_C, 2, 1, 15) -UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, - UYVYToUV422Row_C, 2, 1, 15) +#ifdef HAS_INTERPOLATEROW_NEON +ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif -#ifdef HAS_YUY2TOUV422ROW_NEON -UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, - ARGBToUV444Row_C, 4, 0, 7) -UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, - ARGBToUV422Row_C, 4, 1, 15) -UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, - ARGBToUV411Row_C, 4, 2, 31) -UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, - YUY2ToUV422Row_C, 2, 1, 15) -UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, - UYVYToUV422Row_C, 2, 1, 15) -#endif -#undef UV422ANY +#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 +ANY11T(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, 1, 1, 3) +#endif +#undef ANY11T -#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ - void NAMEANY(const uint8* src_uv, uint8* dst_u, uint8* 
dst_v, int width) { \ +// Any 1 to 1 mirror. +#define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8 temp[64 * 2]); \ + memset(temp, 0, 64); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ + ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ } \ - ANYTOUV_C(src_uv + n * 2, \ - dst_u + n, \ - dst_v + n, \ - width & MASK); \ + memcpy(temp, src_ptr, r * BPP); \ + ANY_SIMD(temp, temp + 64, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 64 + (MASK + 1 - r) * BPP, r * BPP); \ } -#ifdef HAS_SPLITUVROW_SSE2 -SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, SplitUVRow_C, 15) +#ifdef HAS_MIRRORROW_AVX2 +ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #endif -#ifdef HAS_SPLITUVROW_AVX2 -SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, SplitUVRow_C, 31) +#ifdef HAS_MIRRORROW_SSSE3 +ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif -#ifdef HAS_SPLITUVROW_NEON -SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15) +#ifdef HAS_MIRRORROW_SSE2 +ANY11M(MirrorRow_Any_SSE2, MirrorRow_SSE2, 1, 15) #endif -#ifdef HAS_SPLITUVROW_MIPS_DSPR2 -SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, - SplitUVRow_C, 15) +#ifdef HAS_MIRRORROW_NEON +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +#endif +#ifdef HAS_ARGBMIRRORROW_AVX2 +ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) #endif -#undef SPLITUVROWANY +#ifdef HAS_ARGBMIRRORROW_SSE2 +ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) +#endif +#ifdef HAS_ARGBMIRRORROW_NEON +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +#endif +#undef ANY11M -#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK) \ - void NAMEANY(const uint8* src_u, const uint8* src_v, \ - uint8* dst_uv, int width) { \ +// Any 1 plane. (memset) +#define ANY1(NAMEANY, ANY_SIMD, T, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, T v32, int width) { \ + SIMD_ALIGNED(uint8 temp[64]); \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ANYTOUV_SIMD(src_u, src_v, dst_uv, n); \ + ANY_SIMD(dst_ptr, v32, n); \ } \ - ANYTOUV_C(src_u + n, \ - src_v + n, \ - dst_uv + n * 2, \ - width & MASK); \ + ANY_SIMD(temp, v32, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp, r * BPP); \ } -#ifdef HAS_MERGEUVROW_SSE2 -MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, MergeUVRow_C, 15) +#ifdef HAS_SETROW_X86 +ANY1(SetRow_Any_X86, SetRow_X86, uint8, 1, 3) #endif -#ifdef HAS_MERGEUVROW_AVX2 -MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, MergeUVRow_C, 31) +#ifdef HAS_SETROW_NEON +ANY1(SetRow_Any_NEON, SetRow_NEON, uint8, 1, 15) #endif -#ifdef HAS_MERGEUVROW_NEON -MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15) +#ifdef HAS_ARGBSETROW_NEON +ANY1(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, uint32, 4, 3) #endif -#undef MERGEUVROW_ANY +#undef ANY1 -#define MATHROW_ANY(NAMEANY, ARGBMATH_SIMD, ARGBMATH_C, MASK) \ - void NAMEANY(const uint8* src_argb0, const uint8* src_argb1, \ - uint8* dst_argb, int width) { \ +// Any 1 to 2. Outputs UV planes. 
+#define ANY12(NAMEANY, ANY_SIMD, UVSHIFT, BPP, DUVSHIFT, MASK) \ + void NAMEANY(const uint8* src_ptr, uint8* dst_u, uint8* dst_v, int width) {\ + SIMD_ALIGNED(uint8 temp[128 * 3]); \ + memset(temp, 0, 128); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ARGBMATH_SIMD(src_argb0, src_argb1, dst_argb, n); \ + ANY_SIMD(src_ptr, dst_u, dst_v, n); \ + } \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ } \ - ARGBMATH_C(src_argb0 + n * 4, \ - src_argb1 + n * 4, \ - dst_argb + n * 4, \ - width & MASK); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_u + (n >> DUVSHIFT), temp + 128, SS(r, DUVSHIFT)); \ + memcpy(dst_v + (n >> DUVSHIFT), temp + 256, SS(r, DUVSHIFT)); \ } -#ifdef HAS_ARGBMULTIPLYROW_SSE2 -MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, - 3) -#endif -#ifdef HAS_ARGBADDROW_SSE2 -MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3) +#ifdef HAS_SPLITUVROW_SSE2 +ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) #endif -#ifdef HAS_ARGBSUBTRACTROW_SSE2 -MATHROW_ANY(ARGBSubtractRow_Any_SSE2, ARGBSubtractRow_SSE2, ARGBSubtractRow_C, - 3) +#ifdef HAS_SPLITUVROW_AVX2 +ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #endif -#ifdef HAS_ARGBMULTIPLYROW_AVX2 -MATHROW_ANY(ARGBMultiplyRow_Any_AVX2, ARGBMultiplyRow_AVX2, ARGBMultiplyRow_C, - 7) +#ifdef HAS_SPLITUVROW_NEON +ANY12(SplitUVRow_Any_NEON, SplitUVRow_NEON, 0, 2, 0, 15) #endif -#ifdef HAS_ARGBADDROW_AVX2 -MATHROW_ANY(ARGBAddRow_Any_AVX2, ARGBAddRow_AVX2, ARGBAddRow_C, 7) +#ifdef HAS_SPLITUVROW_MIPS_DSPR2 +ANY12(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_MIPS_DSPR2, 0, 2, 0, 15) #endif -#ifdef HAS_ARGBSUBTRACTROW_AVX2 -MATHROW_ANY(ARGBSubtractRow_Any_AVX2, ARGBSubtractRow_AVX2, ARGBSubtractRow_C, - 7) +#ifdef HAS_ARGBTOUV444ROW_SSSE3 +ANY12(ARGBToUV444Row_Any_SSSE3, ARGBToUV444Row_SSSE3, 0, 4, 0, 15) #endif -#ifdef HAS_ARGBMULTIPLYROW_NEON -MATHROW_ANY(ARGBMultiplyRow_Any_NEON, ARGBMultiplyRow_NEON, ARGBMultiplyRow_C, - 7) +#ifdef HAS_YUY2TOUV422ROW_AVX2 +ANY12(YUY2ToUV422Row_Any_AVX2, YUY2ToUV422Row_AVX2, 1, 4, 1, 31) +ANY12(UYVYToUV422Row_Any_AVX2, UYVYToUV422Row_AVX2, 1, 4, 1, 31) #endif -#ifdef HAS_ARGBADDROW_NEON -MATHROW_ANY(ARGBAddRow_Any_NEON, ARGBAddRow_NEON, ARGBAddRow_C, 7) +#ifdef HAS_ARGBTOUV422ROW_SSSE3 +ANY12(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_SSSE3, 0, 4, 1, 15) #endif -#ifdef HAS_ARGBSUBTRACTROW_NEON -MATHROW_ANY(ARGBSubtractRow_Any_NEON, ARGBSubtractRow_NEON, ARGBSubtractRow_C, - 7) +#ifdef HAS_YUY2TOUV422ROW_SSE2 +ANY12(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_SSE2, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_SSE2, 1, 4, 1, 15) #endif -#undef MATHROW_ANY - -// Shuffle may want to work in place, so last16 method can not be used. 
-#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ - void NAMEANY(const uint8* src_argb, uint8* dst_argb, \ - const uint8* shuffler, int width) { \ +#ifdef HAS_YUY2TOUV422ROW_NEON +ANY12(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, 0, 4, 0, 7) +ANY12(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, 0, 4, 1, 15) +ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31) +ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15) +ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15) +#endif +#undef ANY12 + +// Any 1 to 2 with source stride (2 rows of source). Outputs UV planes. +// 128 byte row allows for 32 avx ARGB pixels. +#define ANY12S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, int src_stride_ptr, \ + uint8* dst_u, uint8* dst_v, int width) { \ + SIMD_ALIGNED(uint8 temp[128 * 4]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ - ARGBTOY_SIMD(src_argb, dst_argb, shuffler, n); \ + ANY_SIMD(src_ptr, src_stride_ptr, dst_u, dst_v, n); \ } \ - ARGBTOY_C(src_argb + n * SBPP, \ - dst_argb + n * BPP, shuffler, width & MASK); \ - } - -#ifdef HAS_ARGBSHUFFLEROW_SSE2 -YANY(ARGBShuffleRow_Any_SSE2, ARGBShuffleRow_SSE2, - ARGBShuffleRow_C, 4, 4, 3) -#endif -#ifdef HAS_ARGBSHUFFLEROW_SSSE3 -YANY(ARGBShuffleRow_Any_SSSE3, ARGBShuffleRow_SSSE3, - ARGBShuffleRow_C, 4, 4, 7) -#endif -#ifdef HAS_ARGBSHUFFLEROW_AVX2 -YANY(ARGBShuffleRow_Any_AVX2, ARGBShuffleRow_AVX2, - ARGBShuffleRow_C, 4, 4, 15) -#endif -#ifdef HAS_ARGBSHUFFLEROW_NEON -YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, - ARGBShuffleRow_C, 4, 4, 3) -#endif -#undef YANY - -// Interpolate may want to work in place, so last16 method can not be used. -#define NANY(NAMEANY, TERP_SIMD, TERP_C, SBPP, BPP, MASK) \ - void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ - ptrdiff_t src_stride_ptr, int width, \ - int source_y_fraction) { \ - int n = width & ~MASK; \ - if (n > 0) { \ - TERP_SIMD(dst_ptr, src_ptr, src_stride_ptr, n, source_y_fraction); \ + memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP); \ + memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP, \ + SS(r, UVSHIFT) * BPP); \ + if ((width & 1) && BPP == 4) { /* repeat last 4 bytes for subsampler */ \ + memcpy(temp + SS(r, UVSHIFT) * BPP, \ + temp + SS(r, UVSHIFT) * BPP - BPP, 4); \ + memcpy(temp + 128 + SS(r, UVSHIFT) * BPP, \ + temp + 128 + SS(r, UVSHIFT) * BPP - BPP, 4); \ } \ - TERP_C(dst_ptr + n * BPP, \ - src_ptr + n * SBPP, src_stride_ptr, \ - width & MASK, source_y_fraction); \ + ANY_SIMD(temp, 128, temp + 256, temp + 384, MASK + 1); \ + memcpy(dst_u + (n >> 1), temp + 256, SS(r, 1)); \ + memcpy(dst_v + (n >> 1), temp + 384, SS(r, 1)); \ } -#ifdef HAS_INTERPOLATEROW_AVX2 -NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31) -#endif -#ifdef HAS_INTERPOLATEROW_SSSE3 -NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15) -#endif -#ifdef HAS_INTERPOLATEROW_SSE2 -NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15) +#ifdef HAS_ARGBTOUVROW_AVX2 +ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) #endif -#ifdef HAS_INTERPOLATEROW_NEON -NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15) +#ifdef HAS_ARGBTOUVROW_SSSE3 +ANY12S(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_SSSE3, 0, 4, 15) +ANY12S(ARGBToUVJRow_Any_SSSE3, ARGBToUVJRow_SSSE3, 0, 4, 15) +ANY12S(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_SSSE3, 0, 4, 15) 
+ANY12S(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_SSSE3, 0, 4, 15) +ANY12S(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_SSSE3, 0, 4, 15) #endif -#ifdef HAS_INTERPOLATEROW_MIPS_DSPR2 -NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C, - 1, 1, 3) +#ifdef HAS_YUY2TOUVROW_AVX2 +ANY12S(YUY2ToUVRow_Any_AVX2, YUY2ToUVRow_AVX2, 1, 4, 31) +ANY12S(UYVYToUVRow_Any_AVX2, UYVYToUVRow_AVX2, 1, 4, 31) #endif -#undef NANY - -#define MANY(NAMEANY, MIRROR_SIMD, MIRROR_C, BPP, MASK) \ - void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ - int n = width & ~MASK; \ - int r = width & MASK; \ - if (n > 0) { \ - MIRROR_SIMD(src_y, dst_y + r * BPP, n); \ - } \ - MIRROR_C(src_y + n * BPP, dst_y, r); \ - } - -#ifdef HAS_MIRRORROW_AVX2 -MANY(MirrorRow_Any_AVX2, MirrorRow_AVX2, MirrorRow_C, 1, 31) +#ifdef HAS_YUY2TOUVROW_SSE2 +ANY12S(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_SSE2, 1, 4, 15) +ANY12S(UYVYToUVRow_Any_SSE2, UYVYToUVRow_SSE2, 1, 4, 15) #endif -#ifdef HAS_MIRRORROW_SSSE3 -MANY(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, MirrorRow_C, 1, 15) +#ifdef HAS_ARGBTOUVROW_NEON +ANY12S(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, 0, 4, 15) #endif -#ifdef HAS_MIRRORROW_SSE2 -MANY(MirrorRow_Any_SSE2, MirrorRow_SSE2, MirrorRow_C, 1, 15) +#ifdef HAS_ARGBTOUVJROW_NEON +ANY12S(ARGBToUVJRow_Any_NEON, ARGBToUVJRow_NEON, 0, 4, 15) #endif -#ifdef HAS_MIRRORROW_NEON -MANY(MirrorRow_Any_NEON, MirrorRow_NEON, MirrorRow_C, 1, 15) +#ifdef HAS_BGRATOUVROW_NEON +ANY12S(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, 0, 4, 15) #endif -#ifdef HAS_ARGBMIRRORROW_AVX2 -MANY(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, ARGBMirrorRow_C, 4, 7) +#ifdef HAS_ABGRTOUVROW_NEON +ANY12S(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, 0, 4, 15) #endif -#ifdef HAS_ARGBMIRRORROW_SSE2 -MANY(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, ARGBMirrorRow_C, 4, 3) +#ifdef HAS_RGBATOUVROW_NEON +ANY12S(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, 0, 4, 15) #endif -#ifdef HAS_ARGBMIRRORROW_NEON -MANY(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, ARGBMirrorRow_C, 4, 3) +#ifdef HAS_RGB24TOUVROW_NEON +ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15) #endif -#undef MANY - -#define MANY(NAMEANY, COPY_SIMD, COPY_C, BPP, MASK) \ - void NAMEANY(const uint8* src_y, uint8* dst_y, int width) { \ - int n = width & ~MASK; \ - int r = width & MASK; \ - if (n > 0) { \ - COPY_SIMD(src_y, dst_y, n); \ - } \ - COPY_C(src_y + n * BPP, dst_y + n * BPP, r); \ - } - -#ifdef HAS_COPYROW_AVX -MANY(CopyRow_Any_AVX, CopyRow_AVX, CopyRow_C, 1, 63) +#ifdef HAS_RAWTOUVROW_NEON +ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15) #endif -#ifdef HAS_COPYROW_SSE2 -MANY(CopyRow_Any_SSE2, CopyRow_SSE2, CopyRow_C, 1, 31) +#ifdef HAS_RGB565TOUVROW_NEON +ANY12S(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, 0, 2, 15) #endif -#ifdef HAS_COPYROW_NEON -MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31) +#ifdef HAS_ARGB1555TOUVROW_NEON +ANY12S(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, 0, 2, 15) #endif -#undef MANY - -#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \ - void NAMEANY(uint8* dst_y, T v8, int width) { \ - int n = width & ~MASK; \ - int r = width & MASK; \ - if (n > 0) { \ - SET_SIMD(dst_y, v8, n); \ - } \ - SET_C(dst_y + n * BPP, v8, r); \ - } - -#ifdef HAS_SETROW_X86 -SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3) +#ifdef HAS_ARGB4444TOUVROW_NEON +ANY12S(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, 0, 2, 15) #endif -#ifdef HAS_SETROW_NEON -SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15) +#ifdef HAS_YUY2TOUVROW_NEON +ANY12S(YUY2ToUVRow_Any_NEON, 
YUY2ToUVRow_NEON, 1, 4, 15)
#endif
-#ifdef HAS_ARGBSETROW_NEON
-SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3)
+#ifdef HAS_UYVYTOUVROW_NEON
+ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
#endif
-#undef SETANY
+#undef ANY12S

#ifdef __cplusplus
} // extern "C"
diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc
index e0e2bf4..4987589 100644
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -199,28 +199,36 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
  }
}

+// dither4 is a row of 4 values from a 4x4 dither matrix.
+// The 4x4 matrix contains values to increase RGB. When converting to
+// fewer bits (565) this provides an ordered dither.
+// The first byte of the matrix corresponds to the upper-left pixel.
+// The 4 values are passed as an int, then referenced as an array, so
+// endianness will not affect the order of the original matrix. But dither4
+// will contain the first pixel in the lower byte for little endian
+// or the upper byte for big endian.
void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb,
-                             const uint8* dither8x8, int width) {
+                             const uint32 dither4, int width) {
  int x;
  for (x = 0; x < width - 1; x += 2) {
-    int dither0 = dither8x8[x & 7] - 128;
-    int dither1 = dither8x8[(x & 7) + 1] - 128;
-    uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
-    uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
-    uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
-    uint8 b1 = Clamp(src_argb[4] + dither1) >> 3;
-    uint8 g1 = Clamp(src_argb[5] + dither1) >> 2;
-    uint8 r1 = Clamp(src_argb[6] + dither1) >> 3;
+    int dither0 = ((const unsigned char*)(&dither4))[x & 3];
+    int dither1 = ((const unsigned char*)(&dither4))[(x + 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
+    uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
+    uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
+    uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) |
              (b1 << 16) | (g1 << 21) | (r1 << 27));
    dst_rgb += 4;
    src_argb += 8;
  }
  if (width & 1) {
-    int dither0 = dither8x8[(width - 1) & 7] - 128;
-    uint8 b0 = Clamp(src_argb[0] + dither0) >> 3;
-    uint8 g0 = Clamp(src_argb[1] + dither0) >> 2;
-    uint8 r0 = Clamp(src_argb[2] + dither0) >> 3;
+    int dither0 = ((const unsigned char*)(&dither4))[(width - 1) & 3];
+    uint8 b0 = clamp255(src_argb[0] + dither0) >> 3;
+    uint8 g0 = clamp255(src_argb[1] + dither0) >> 2;
+    uint8 r0 = clamp255(src_argb[2] + dither0) >> 3;
    *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11);
  }
}
@@ -974,7 +982,7 @@ void SobelXYRow_C(const uint8* src_sobelx, const uint8* src_sobely,
  }
}

-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
+void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
  // Copy a Y to RGB.
  int x;
  for (x = 0; x < width; ++x) {
@@ -986,38 +994,42 @@ void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
  }
}

-// YUV to RGB conversion constants.
+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
// Y contribution to R,G,B. Scale and bias.
// TODO(fbarchard): Consider moving constants into a common header.
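A quick numeric check of the fixed-point scheme the defines below implement (constants as defined below; the float reference is the BT.601 formula above). For Y = 128 with no chroma: y1 = (128 * 0x0101 * 18997) >> 16 = 9535, and (9535 + YGB) >> 6 = (9535 - 1160) >> 6 = 130, against the float value (128 - 16) * 1.164 = 130.37. As a standalone, compilable sketch (not part of the patch):

#include <stdio.h>
int main(void) {
  const int YG = 18997;   /* round(1.164 * 64 * 256 * 256 / 257) */
  const int YGB = -1160;  /* folds in -16 * 1.164 * 64 plus 32 for rounding */
  int y = 128;
  /* y * 0x0101 spreads y over 16 bits, so >> 16 leaves ~y * 1.164 * 64. */
  int y1 = (int)(((unsigned)(y * 0x0101) * YG) >> 16);
  printf("%d\n", (y1 + YGB) >> 6);  /* prints 130 */
  return 0;
}

The "+ 64 / 2" term now folded into YGB is what the old "adjusted for even error distribution" comment referred to: it makes the final >> 6 round rather than truncate.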
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */

// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */

// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR (VR * 128 - YGB)
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)

// C reference code that mimics the YUV assembly.
static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(BB - ( u * UB) + y1) >> 6);
-  *g = Clamp((int32)(BG - (v * VG + u * UG) + y1) >> 6);
-  *r = Clamp((int32)(BR - (v * VR ) + y1) >> 6);
+  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
+  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
+  *r = Clamp((int32)(-(v * VR) + y1 + BR) >> 6);
}

// C reference code that mimics the YUV assembly.
static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(y1 - YGB) >> 6);
-  *g = Clamp((int32)(y1 - YGB) >> 6);
-  *r = Clamp((int32)(y1 - YGB) >> 6);
+  *b = Clamp((int32)(y1 + YGB) >> 6);
+  *g = Clamp((int32)(y1 + YGB) >> 6);
+  *r = Clamp((int32)(y1 + YGB) >> 6);
}

#undef YG
@@ -1030,6 +1042,46 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
#undef BG
#undef BR

+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// C reference code that mimics the YUV assembly.
+static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
+                               uint8* b, uint8* g, uint8* r) {
+  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
+  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
+  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
+  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
+}
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
#if !defined(LIBYUV_DISABLE_NEON) && \
    (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON))
// C mimic assembly.
@@ -1102,34 +1154,6 @@ void I422ToARGBRow_C(const uint8* src_y,
  }
}

-// C reference code that mimics the YUV assembly.
-// * R = Y + 1.40200 * Cr -// * G = Y - 0.34414 * Cb - 0.71414 * Cr -// * B = Y + 1.77200 * Cb - -#define YGJ 64 /* (int8)round(1.000 * 64) */ - -#define UBJ 113 /* (int8)round(1.772 * 64) */ -#define UGJ -22 /* (int8)round(-0.34414 * 64) */ -#define URJ 0 - -#define VBJ 0 -#define VGJ -46 /* (int8)round(-0.71414 * 64) */ -#define VRJ 90 /* (int8)round(1.402 * 64) */ - -// Bias -#define BBJ (UBJ * 128 + VBJ * 128) -#define BGJ (UGJ * 128 + VGJ * 128) -#define BRJ (URJ * 128 + VRJ * 128) - -static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v, - uint8* b, uint8* g, uint8* r) { - uint32 y1 = (uint32)(y * YGJ); - *b = Clamp((int32)(u * UBJ + v * VBJ + y1 - BBJ) >> 6); - *g = Clamp((int32)(u * UGJ + v * VGJ + y1 - BGJ) >> 6); - *r = Clamp((int32)(u * URJ + v * VRJ + y1 - BRJ) >> 6); -} - void J422ToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1354,23 +1378,23 @@ void I411ToARGBRow_C(const uint8* src_y, } void NV12ToARGBRow_C(const uint8* src_y, - const uint8* usrc_v, + const uint8* src_uv, uint8* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf[3] = 255; - YuvPixel(src_y[1], usrc_v[0], usrc_v[1], + YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); rgb_buf[7] = 255; src_y += 2; - usrc_v += 2; + src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], usrc_v[0], usrc_v[1], + YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); rgb_buf[3] = 255; } @@ -1402,7 +1426,7 @@ void NV21ToARGBRow_C(const uint8* src_y, } void NV12ToRGB565Row_C(const uint8* src_y, - const uint8* usrc_v, + const uint8* src_uv, uint8* dst_rgb565, int width) { uint8 b0; @@ -1413,8 +1437,8 @@ void NV12ToRGB565Row_C(const uint8* src_y, uint8 r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); - YuvPixel(src_y[1], usrc_v[0], usrc_v[1], &b1, &g1, &r1); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); + YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1424,11 +1448,11 @@ void NV12ToRGB565Row_C(const uint8* src_y, *(uint32*)(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27); src_y += 2; - usrc_v += 2; + src_uv += 2; dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], usrc_v[0], usrc_v[1], &b0, &g0, &r0); + YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -1588,7 +1612,7 @@ void I422ToRGBARow_C(const uint8* src_y, } } -void YToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8* src_y, uint8* rgb_buf, int width) { int x; for (x = 0; x < width - 1; x += 2) { YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); @@ -2062,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr, } } -// Select G channel from ARGB. e.g. GGGGGGGG -void ARGBToBayerGGRow_C(const uint8* src_argb, - uint8* dst_bayer, uint32 selector, int pix) { - // Copy a row of G. - int x; - for (x = 0; x < pix - 1; x += 2) { - dst_bayer[0] = src_argb[1]; - dst_bayer[1] = src_argb[5]; - src_argb += 8; - dst_bayer += 2; - } - if (pix & 1) { - dst_bayer[0] = src_argb[1]; - } -} - // Use first 4 shuffler values to reorder ARGB channels. 
void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -2120,7 +2128,7 @@ void I422ToYUY2Row_C(const uint8* src_y, if (width & 1) { dst_frame[0] = src_y[0]; dst_frame[1] = src_u[0]; - dst_frame[2] = src_y[0]; // duplicate last y + dst_frame[2] = 0; dst_frame[3] = src_v[0]; } } @@ -2144,14 +2152,15 @@ void I422ToUYVYRow_C(const uint8* src_y, dst_frame[0] = src_u[0]; dst_frame[1] = src_y[0]; dst_frame[2] = src_v[0]; - dst_frame[3] = src_y[0]; // duplicate last y + dst_frame[3] = 0; } } // Maximum temporary width for wrappers to process at a time, in pixels. #define MAXTWIDTH 2048 -#if !defined(_MSC_VER) && defined(HAS_I422TORGB565ROW_SSSE3) +#if !(defined(_MSC_VER) && !defined(__clang__)) && \ + defined(HAS_I422TORGB565ROW_SSSE3) // row_win.cc has asm version, but GCC uses 2 step wrapper. void I422ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_u, @@ -2346,6 +2355,50 @@ void I422ToARGB4444Row_AVX2(const uint8* src_y, } #endif +#if defined(HAS_I422TORGB24ROW_AVX2) +void I422ToRGB24Row_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRGB24Row_AVX2 + ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_rgb24 += twidth * 3; + width -= twidth; + } +} +#endif + +#if defined(HAS_I422TORAWROW_AVX2) +void I422ToRAWRow_AVX2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + // Row buffer for intermediate ARGB pixels. + SIMD_ALIGNED32(uint8 row[MAXTWIDTH * 4]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + I422ToARGBRow_AVX2(src_y, src_u, src_v, row, twidth); + // TODO(fbarchard): ARGBToRAWRow_AVX2 + ARGBToRAWRow_SSSE3(row, dst_raw, twidth); + src_y += twidth; + src_u += twidth / 2; + src_v += twidth / 2; + dst_raw += twidth * 3; + width -= twidth; + } +} +#endif + #if defined(HAS_NV12TORGB565ROW_AVX2) void NV12ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, int width) { diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_gcc.cc similarity index 97% rename from third_party/libyuv/source/row_posix.cc rename to third_party/libyuv/source/row_gcc.cc index 1a6f7dc..820de0a 100644 --- a/third_party/libyuv/source/row_posix.cc +++ b/third_party/libyuv/source/row_gcc.cc @@ -236,8 +236,8 @@ void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } #endif // TESTING -#ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { +#ifdef HAS_J400TOARGBROW_SSE2 +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "pslld $0x18,%%xmm5 \n" @@ -262,7 +262,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { :: "memory", "cc", "xmm0", "xmm1", "xmm5" ); } -#endif // HAS_I400TOARGBROW_SSE2 +#endif // HAS_J400TOARGBROW_SSE2 #ifdef HAS_RGB24TOARGBROW_SSSE3 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { @@ -953,7 +953,6 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, #endif // HAS_ARGBTOUVROW_AVX2 #ifdef HAS_ARGBTOUVJROW_SSSE3 -// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3. 
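One detail of the revised I422ToRGB565Row_SSSE3 guard above is worth spelling out: the old test !defined(_MSC_VER) also shut the two-step wrapper off for clang-cl, which defines _MSC_VER for compatibility yet consumes the GCC-style inline asm in this file; the new predicate excludes only genuine cl.exe. Restated for reference (not code from the patch):

/*
 * #if !(defined(_MSC_VER) && !defined(__clang__)) && \
 *     defined(HAS_I422TORGB565ROW_SSSE3)
 *
 *   cl.exe    : _MSC_VER set, __clang__ unset -> excluded; row_win.cc asm used
 *   clang-cl  : _MSC_VER set, __clang__ set   -> included; two-step wrapper used
 *   gcc/clang : _MSC_VER unset                -> included; two-step wrapper used
 */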
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
@@ -1414,22 +1413,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,

#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

-// YUV to RGB conversion constants.
-// Y contribution to R,G,B. Scale and bias.
-#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
-#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
-
-// U and V contributions to R,G,B.
-#define UB -128 /* -min(128, round(2.018 * 64)) */
-#define UG 25 /* -round(-0.391 * 64) */
-#define VG 52 /* -round(-0.813 * 64) */
-#define VR -102 /* -round(1.596 * 64) */
-
-// Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128 - YGB)
-#define BG (UG * 128 + VG * 128 - YGB)
-#define BR (VR * 128 - YGB)
-
struct YuvConstants {
  lvec8 kUVToB;     // 0
  lvec8 kUVToG;     // 32
@@ -1440,6 +1423,27 @@ struct YuvConstants {
  lvec16 kYToRgb;   // 192
};

+// BT.601 YUV to RGB reference
+// R = (Y - 16) * 1.164 - V * -1.596
+// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
+// B = (Y - 16) * 1.164 - U * -2.018
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* max(-128, round(-2.018 * 64)) */
+#define UG 25 /* round(0.391 * 64) */
+#define VG 52 /* round(0.813 * 64) */
+#define VR -102 /* round(-1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 + YGB)
+#define BG (UG * 128 + VG * 128 + YGB)
+#define BR (VR * 128 + YGB)
+
// BT601 constants for YUV to RGB.
static YuvConstants SIMD_ALIGNED(kYuvConstants) = {
  { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0,
@@ -1468,6 +1472,67 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
  { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG }
};

+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+// JPEG YUV to RGB reference
+// * R = Y - V * -1.40200
+// * G = Y - U * 0.34414 - V * 0.71414
+// * B = Y - U * -1.77200
+
+// Y contribution to R,G,B. Scale and bias.
+// TODO(fbarchard): Consider moving constants into a common header.
+#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
+#define YGBJ 32 /* 64 / 2 */
+
+// U and V contributions to R,G,B.
+#define UBJ -113 /* round(-1.77200 * 64) */
+#define UGJ 22 /* round(0.34414 * 64) */
+#define VGJ 46 /* round(0.71414 * 64) */
+#define VRJ -90 /* round(-1.40200 * 64) */
+
+// Bias values to round, and subtract 128 from U and V.
+#define BBJ (UBJ * 128 + YGBJ)
+#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
+
+// JPEG constants for YUV to RGB.
+YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
+  { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0,
+    UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 },
+  { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ,
+    UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ },
+  { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ,
+    0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ },
+  { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ,
+    BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ },
+  { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ,
+    BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ },
+  { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ,
+    BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ },
+  { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ,
+    YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ }
+};
+
+#undef YGJ
+#undef YGBJ
+#undef UBJ
+#undef UGJ
+#undef VGJ
+#undef VRJ
+#undef BBJ
+#undef BGJ
+#undef BRJ
+
// Read 8 UV from 444
#define READYUV444 \
    "movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
@@ -1534,8 +1599,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
    "punpcklwd %%xmm2,%%xmm0 \n" \
    "punpckhwd %%xmm2,%%xmm1 \n" \
    "movdqu %%xmm0," MEMACCESS([dst_argb]) " \n" \
-    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_argb]) " \n" \
-    "lea " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
+    "movdqu %%xmm1," MEMACCESS2(0x10, [dst_argb]) " \n" \
+    "lea " MEMLEA(0x20, [dst_argb]) ", %[dst_argb] \n"

// Store 8 BGRA values. Assumes XMM5 is zero.
#define STOREBGRA \
@@ -1546,8 +1611,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
    "punpcklwd %%xmm1,%%xmm5 \n" \
    "punpckhwd %%xmm1,%%xmm0 \n" \
    "movdqu %%xmm5," MEMACCESS([dst_bgra]) " \n" \
-    "movdqu %%xmm0," MEMACCESS2(0x10,[dst_bgra]) " \n" \
-    "lea " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
+    "movdqu %%xmm0," MEMACCESS2(0x10, [dst_bgra]) " \n" \
+    "lea " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra] \n"

// Store 8 ABGR values. Assumes XMM5 is zero.
#define STOREABGR \
@@ -1557,8 +1622,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = {
    "punpcklwd %%xmm0,%%xmm2 \n" \
    "punpckhwd %%xmm0,%%xmm1 \n" \
    "movdqu %%xmm2," MEMACCESS([dst_abgr]) " \n" \
-    "movdqu %%xmm1," MEMACCESS2(0x10,[dst_abgr]) " \n" \
-    "lea " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
+    "movdqu %%xmm1," MEMACCESS2(0x10, [dst_abgr]) " \n" \
+    "lea " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr] \n"

// Store 8 RGBA values. Assumes XMM5 is zero.
#define STORERGBA \ @@ -1569,8 +1634,8 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { "punpcklwd %%xmm1,%%xmm5 \n" \ "punpckhwd %%xmm1,%%xmm0 \n" \ "movdqu %%xmm5," MEMACCESS([dst_rgba]) " \n" \ - "movdqu %%xmm0," MEMACCESS2(0x10,[dst_rgba]) " \n" \ - "lea " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n" + "movdqu %%xmm0," MEMACCESS2(0x10, [dst_rgba]) " \n" \ + "lea " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba] \n" void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -1713,6 +1778,32 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ); } +void OMITFP J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + asm volatile ( + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB(kYuvConstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc", NACL_R14 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + ); +} + void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1881,10 +1972,10 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, "vpmaddubsw " MEMACCESS([YuvConstants]) ",%%ymm0,%%ymm0 \n" \ "vmovdqu " MEMACCESS2(160, [YuvConstants]) ",%%ymm3 \n" \ "vpsubw %%ymm2,%%ymm3,%%ymm2 \n" \ - "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm2 \n" \ - "vpsubw %%ymm1,%%ymm2,%%ymm1 \n" \ - "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm1 \n" \ - "vpsubw %%ymm0,%%ymm1,%%ymm0 \n" \ + "vmovdqu " MEMACCESS2(128, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm1,%%ymm3,%%ymm1 \n" \ + "vmovdqu " MEMACCESS2(96, [YuvConstants]) ",%%ymm3 \n" \ + "vpsubw %%ymm0,%%ymm3,%%ymm0 \n" \ "vmovdqu " MEMACCESS([y_buf]) ",%%xmm3 \n" \ "lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n" \ "vpermq $0xd8,%%ymm3,%%ymm3 \n" \ @@ -1984,6 +2075,48 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#if defined(HAS_J422TOARGBROW_AVX2) +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+void OMITFP J422ToARGBRow_AVX2(const uint8* y_buf,
+                               const uint8* u_buf,
+                               const uint8* v_buf,
+                               uint8* dst_argb,
+                               int width) {
+  asm volatile (
+    "sub %[u_buf],%[v_buf] \n"
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+    LABELALIGN
+  "1: \n"
+    READYUV422_AVX2
+    YUVTORGB_AVX2(kYuvConstants)
+
+    // Step 3: Weave into ARGB
+    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"  // BG
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2 \n"  // RA
+    "vpermq $0xd8,%%ymm2,%%ymm2 \n"
+    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1 \n"  // BGRA first 8 pixels
+    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0 \n"  // BGRA next 8 pixels
+
+    "vmovdqu %%ymm1," MEMACCESS([dst_argb]) "\n"
+    "vmovdqu %%ymm0," MEMACCESS2(0x20,[dst_argb]) "\n"
+    "lea " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
+    "sub $0x10,%[width] \n"
+    "jg 1b \n"
+    "vzeroupper \n"
+  : [y_buf]"+r"(y_buf),    // %[y_buf]
+    [u_buf]"+r"(u_buf),    // %[u_buf]
+    [v_buf]"+r"(v_buf),    // %[v_buf]
+    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
+    [width]"+rm"(width)    // %[width]
+  : [kYuvConstants]"r"(&kYuvJConstants.kUVToB) // %[kYuvConstants]
+  : "memory", "cc", NACL_R14
+    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+  );
+}
+#endif // HAS_J422TOARGBROW_AVX2
+
#if defined(HAS_I422TOABGRROW_AVX2)
// 16 pixels
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes).
@@ -2066,8 +2199,8 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
}
#endif // HAS_I422TORGBAROW_AVX2

-#ifdef HAS_YTOARGBROW_SSE2
-void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
+#ifdef HAS_I400TOARGBROW_SSE2
+void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n"  // 4a35 = 18997 = 1.164
    "movd %%eax,%%xmm2 \n"
@@ -2109,12 +2242,12 @@ void YToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
-#endif // HAS_YTOARGBROW_SSE2
+#endif // HAS_I400TOARGBROW_SSE2

-#ifdef HAS_YTOARGBROW_AVX2
+#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// note: vpunpcklbw mutates and vpackuswb unmutates.
-void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
+void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov $0x4a354a35,%%eax \n"  // 4a35 = 18997 = 1.164
    "vmovd %%eax,%%xmm2 \n"
@@ -2156,7 +2289,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
-#endif // HAS_YTOARGBROW_AVX2
+#endif // HAS_I400TOARGBROW_AVX2

#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
@@ -3096,41 +3229,7 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    "psllw $0x8,%%xmm5 \n"
    "pcmpeqb %%xmm4,%%xmm4 \n"
    "pslld $0x18,%%xmm4 \n"
-    "sub $0x1,%3 \n"
-    "je 91f \n"
-    "jl 99f \n"
-
-    // 1 pixel loop until destination pointer is aligned.
- "10: \n" - "test $0xf,%2 \n" - "je 19f \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "pshufhw $0xf5,%%xmm3,%%xmm3 \n" - "pshuflw $0xf5,%%xmm3,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - "add $1-4,%3 \n" + "sub $0x4,%3 \n" "jl 49f \n" // 4 pixel loop. @@ -3231,39 +3330,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, "psllw $0x8,%%xmm5 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "pslld $0x18,%%xmm4 \n" - "sub $0x1,%3 \n" - "je 91f \n" - "jl 99f \n" - - // 1 pixel loop until destination pointer is aligned. - "10: \n" - "test $0xf,%2 \n" - "je 19f \n" - "movd " MEMACCESS(0) ",%%xmm3 \n" - "lea " MEMLEA(0x4,0) ",%0 \n" - "movdqa %%xmm3,%%xmm0 \n" - "pxor %%xmm4,%%xmm3 \n" - "movd " MEMACCESS(1) ",%%xmm2 \n" - "pshufb %4,%%xmm3 \n" - "pand %%xmm6,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "pmullw %%xmm3,%%xmm2 \n" - "movd " MEMACCESS(1) ",%%xmm1 \n" - "lea " MEMLEA(0x4,1) ",%1 \n" - "psrlw $0x8,%%xmm1 \n" - "por %%xmm4,%%xmm0 \n" - "pmullw %%xmm3,%%xmm1 \n" - "psrlw $0x8,%%xmm2 \n" - "paddusb %%xmm2,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "paddusb %%xmm1,%%xmm0 \n" - "movd %%xmm0," MEMACCESS(2) " \n" - "lea " MEMLEA(0x4,2) ",%2 \n" - "sub $0x1,%3 \n" - "jge 10b \n" - - "19: \n" - "add $1-4,%3 \n" + "sub $0x4,%3 \n" "jl 49f \n" // 4 pixel loop. @@ -4897,37 +4964,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 -#ifdef HAS_ARGBTOBAYERGGROW_SSE2 -void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrld $0x18,%%xmm5 \n" - LABELALIGN - "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "movdqu " MEMACCESS2(0x10,0) ",%%xmm1 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" - "psrld $0x8,%%xmm0 \n" - "psrld $0x8,%%xmm1 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packssdw %%xmm1,%%xmm0 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movq %%xmm0," MEMACCESS(1) " \n" - "lea " MEMLEA(0x8,1) ",%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : - : "memory", "cc" - , "xmm0", "xmm1", "xmm5" - ); -} -#endif // HAS_ARGBTOBAYERGGROW_SSE2 - #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc index 8badc5a..1a72eb9 100644 --- a/third_party/libyuv/source/row_neon.cc +++ b/third_party/libyuv/source/row_neon.cc @@ -94,11 +94,17 @@ extern "C" { "vtrn.u32 d2, d3 \n" #define YUV422TORGB_SETUP_REG \ + MEMACCESS([kUVToRB]) \ "vld1.8 {d24}, [%[kUVToRB]] \n" \ + MEMACCESS([kUVToG]) \ "vld1.8 {d25}, [%[kUVToG]] \n" \ + MEMACCESS([kUVBiasBGR]) \ "vld1.16 {d26[], d27[]}, [%[kUVBiasBGR]]! \n" \ + MEMACCESS([kUVBiasBGR]) \ "vld1.16 {d8[], d9[]}, [%[kUVBiasBGR]]! 
\n" \ + MEMACCESS([kUVBiasBGR]) \ "vld1.16 {d28[], d29[]}, [%[kUVBiasBGR]] \n" \ + MEMACCESS([kYToRgb]) \ "vld1.32 {d30[], d31[]}, [%[kYToRgb]] \n" #define YUV422TORGB \ @@ -186,7 +192,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -216,7 +222,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -246,7 +252,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -277,7 +283,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -308,7 +314,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -338,7 +344,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -367,7 +373,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -397,7 +403,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -439,7 +445,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -485,7 +491,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -526,14 +532,14 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %6 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -void YToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +void 
I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { asm volatile ( YUV422TORGB_SETUP_REG ".p2align 2 \n" @@ -552,17 +558,17 @@ void YToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %4 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -void I400ToARGBRow_NEON(const uint8* src_y, +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - ".p2align 2 \n" "vmov.u8 d23, #255 \n" + ".p2align 2 \n" "1: \n" MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" @@ -603,7 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %5 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -631,7 +637,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %5 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -659,7 +665,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %5 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -687,7 +693,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, [kUVToG]"r"(&kUVToG), // %5 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -713,7 +719,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, [kUVToG]"r"(&kUVToG), // %4 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -739,7 +745,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, [kUVToG]"r"(&kUVToG), // %4 [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) - : "cc", "memory", "q0", "q1", "q2", "q3", + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } @@ -1245,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, ); } -// Select G channels from ARGB. e.g. GGGGGGGG -void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 /*selector*/, int pix) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "vst1.8 {d1}, [%1]! \n" // store 8 G's. - "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "q0", "q1" // Clobber List - ); -} - // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { @@ -1360,6 +1347,30 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { ); } +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + ".p2align 2 \n" + "vdup.32 d2, %2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "vld4.8 {d20, d21, d22, d23}, [%1]! 
\n" // load 8 pixels of ARGB. + "subs %3, %3, #8 \n" // 8 processed per loop. + "vqadd.u8 d20, d20, d2 \n" + "vqadd.u8 d21, d21, d2 \n" + "vqadd.u8 d22, d22, d2 \n" + ARGBTORGB565 + MEMACCESS(0) + "vst1.8 {q0}, [%0]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "q0", "q1", "q8", "q9", "q10", "q11" + ); +} + void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { asm volatile ( diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index ddccd5d..5d01545 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -178,7 +178,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV444 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -207,7 +207,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -236,7 +236,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, "1: \n" READYUV411 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -265,7 +265,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v21, v22, v23) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -294,7 +294,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v20, v21, v22) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -323,7 +323,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v23, v22, v21) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" @@ -352,7 +352,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" @@ -380,7 +380,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v20, v21, v22) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" @@ -415,7 +415,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" ARGBTORGB565 MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. 
@@ -453,7 +453,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) @@ -494,7 +494,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "1: \n" READYUV422 YUV422TORGB(v22, v21, v20) - "subs %4, %4, #8 \n" + "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 MEMACCESS(3) @@ -513,33 +513,34 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, } #endif // HAS_I422TOARGB4444ROW_NEON -#ifdef HAS_YTOARGBROW_NEON -void YToARGBRow_NEON(const uint8* src_y, - uint8* dst_argb, - int width) { +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + int64 width64 = (int64)(width); asm volatile ( YUV422TORGB_SETUP_REG "1: \n" READYUV400 YUV422TORGB(v22, v21, v20) - "subs %2, %2, #8 \n" + "subs %w2, %w2, #8 \n" "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } -#endif // HAS_YTOARGBROW_NEON +#endif // HAS_I400TOARGBROW_NEON -#ifdef HAS_I400TOARGBROW_NEON -void I400ToARGBRow_NEON(const uint8* src_y, +#ifdef HAS_J400TOARGBROW_NEON +void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( @@ -549,7 +550,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, "ld1 {v20.8b}, [%0], #8 \n" "orr v21.8b, v20.8b, v20.8b \n" "orr v22.8b, v20.8b, v20.8b \n" - "subs %2, %2, #8 \n" + "subs %w2, %w2, #8 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -560,7 +561,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, : "cc", "memory", "v20", "v21", "v22", "v23" ); } -#endif // HAS_I400TOARGBROW_NEON +#endif // HAS_J400TOARGBROW_NEON #ifdef HAS_NV12TOARGBROW_NEON void NV12ToARGBRow_NEON(const uint8* src_y, @@ -572,7 +573,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "1: \n" READNV12 YUV422TORGB(v22, v21, v20) - "subs %3, %3, #8 \n" + "subs %w3, %w3, #8 \n" "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" @@ -599,7 +600,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, "1: \n" READNV21 YUV422TORGB(v22, v21, v20) - "subs %3, %3, #8 \n" + "subs %w3, %w3, #8 \n" "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" @@ -626,7 +627,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "1: \n" READNV12 YUV422TORGB(v22, v21, v20) - "subs %3, %3, #8 \n" + "subs %w3, %w3, #8 \n" ARGBTORGB565 MEMACCESS(2) "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. @@ -653,7 +654,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, "1: \n" READNV21 YUV422TORGB(v22, v21, v20) - "subs %3, %3, #8 \n" + "subs %w3, %w3, #8 \n" ARGBTORGB565 MEMACCESS(2) "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. 
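Several of these functions also gain an explicit `int64 width64 = (int64)(width);` and bind the asm operand to that copy rather than to `width` itself. A plausible reading: where the counter is also consumed at full register width, for example in address arithmetic such as `add %0, %0, %2` in MirrorRow_NEON below, the upper 32 bits of the register backing a plain `int` operand are not guaranteed to hold a defined value, so the code widens in C first. A hedged sketch of the hazard and the workaround; `end_of_row` is a made-up helper, not part of libyuv:

#include <stdint.h>

/* Advance a pointer by "width" bytes inside asm. Copying the 32-bit
 * width through an int64 sign-extends it in C, so the full x-register
 * the asm reads is well defined before the 64-bit add. */
static const uint8_t* end_of_row(const uint8_t* src, int width) {
  int64_t width64 = (int64_t)width;  // defined 64-bit value for the asm
#if defined(__aarch64__)
  asm("add %0, %0, %1" : "+r"(src) : "r"(width64));
#else
  src += width64;
#endif
  return src;
}

int main(void) {
  uint8_t row[64];
  return end_of_row(row, 64) == row + 64 ? 0 : 1;
}

This pairs with the `%w` change: counters that stay 32-bit get the `w` form, while counters that feed 64-bit addressing get widened up front.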
@@ -674,19 +675,20 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { + int64 width64 = (int64)(width); asm volatile ( YUV422TORGB_SETUP_REG "1: \n" READYUY2 YUV422TORGB(v22, v21, v20) - "subs %2, %2, #8 \n" + "subs %w2, %w2, #8 \n" "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", @@ -699,19 +701,20 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { + int64 width64 = (int64)(width); asm volatile ( YUV422TORGB_SETUP_REG "1: \n" READUYVY YUV422TORGB(v22, v21, v20) - "subs %2, %2, #8 \n" + "subs %w2, %w2, #8 \n" "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : [kUVBiasBGR]"r"(&kUVBiasBGR), [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", @@ -728,7 +731,7 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, "1: \n" MEMACCESS(0) "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "subs %3, %3, #16 \n" // 16 processed per loop + "subs %w3, %w3, #16 \n" // 16 processed per loop MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store U MEMACCESS(2) @@ -754,7 +757,7 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, "ld1 {v0.16b}, [%0], #16 \n" // load U MEMACCESS(1) "ld1 {v1.16b}, [%1], #16 \n" // load V - "subs %3, %3, #16 \n" // 16 processed per loop + "subs %w3, %w3, #16 \n" // 16 processed per loop MEMACCESS(2) "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" @@ -776,7 +779,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { "1: \n" MEMACCESS(0) "ld1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 32 - "subs %2, %2, #32 \n" // 32 processed per loop + "subs %w2, %w2, #32 \n" // 32 processed per loop MEMACCESS(1) "st1 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 32 "b.gt 1b \n" @@ -794,7 +797,7 @@ void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %1, %1, #16 \n" // 16 bytes per loop + "subs %w1, %w1, #16 \n" // 16 bytes per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -809,7 +812,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %1, %1, #4 \n" // 4 ints per loop + "subs %w1, %w1, #4 \n" // 4 ints per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store "b.gt 1b \n" @@ -822,6 +825,7 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { #ifdef HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + int64 width64 = (int64) width; asm volatile ( // Start at end of source row. "add %0, %0, %2 \n" @@ -830,7 +834,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %2, %2, #16 \n" // 16 pixels per loop. + "subs %2, %2, #16 \n" // 16 pixels per loop. 
"rev64 v0.16b, v0.16b \n" MEMACCESS(1) "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 @@ -839,7 +843,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); @@ -849,6 +853,7 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { #ifdef HAS_MIRRORUVROW_NEON void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int64 width64 = (int64) width; asm volatile ( // Start at end of source row. "add %0, %0, %3, lsl #1 \n" @@ -868,7 +873,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(width64) // %3 : "r"((ptrdiff_t)-16) // %4 : "cc", "memory", "v0", "v1" ); @@ -877,6 +882,7 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, #ifdef HAS_ARGBMIRRORROW_NEON void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + int64 width64 = (int64) width; asm volatile ( // Start at end of source row. "add %0, %0, %2, lsl #2 \n" @@ -894,7 +900,7 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); @@ -908,7 +914,7 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "1: \n" MEMACCESS(0) "ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. MEMACCESS(1) "st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB pixels "b.gt 1b \n" @@ -928,7 +934,7 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "1: \n" MEMACCESS(0) "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g "orr v4.8b, v0.8b, v0.8b \n" // move r MEMACCESS(1) @@ -963,7 +969,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB MEMACCESS(1) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels @@ -1022,7 +1028,7 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB MEMACCESS(1) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels @@ -1055,7 +1061,7 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB MEMACCESS(1) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB pixels @@ -1075,7 +1081,7 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { "1: \n" MEMACCESS(0) "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
MEMACCESS(1) "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24. "b.gt 1b \n" @@ -1094,7 +1100,7 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { "1: \n" MEMACCESS(0) "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v4.8b, v2.8b, v2.8b \n" // mov g "orr v5.8b, v1.8b, v1.8b \n" // mov b MEMACCESS(1) @@ -1115,7 +1121,7 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "subs %2, %2, #16 \n" // 16 processed per loop. + "subs %w2, %w2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" @@ -1134,7 +1140,7 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "subs %2, %2, #16 \n" // 16 processed per loop. + "subs %w2, %w2, #16 \n" // 16 processed per loop. MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" @@ -1154,7 +1160,7 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 pixels - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v1.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) @@ -1177,7 +1183,7 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY pixels - "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "st1 {v0.8b}, [%1], #8 \n" // store 8 U. MEMACCESS(2) @@ -1201,7 +1207,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U @@ -1231,7 +1237,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U @@ -1253,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, } #endif // HAS_UYVYTOUVROW_NEON -// Select G channels from ARGB. e.g. GGGGGGGG -#ifdef HAS_ARGBTOBAYERGGROW_NEON -void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, - uint32 /*selector*/, int pix) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels - "subs %2, %2, #8 \n" // 8 processed per loop - MEMACCESS(1) - "st1 {v1.8b}, [%1], #8 \n" // store 8 G's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_bayer), // %1 - "+r"(pix) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} -#endif // HAS_ARGBTOBAYERGGROW_NEON - // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
#ifdef HAS_ARGBSHUFFLEROW_NEON void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, @@ -1284,7 +1269,7 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "subs %2, %2, #4 \n" // 4 processed per loop + "subs %w2, %w2, #4 \n" // 4 processed per loop "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store 4. @@ -1312,7 +1297,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels MEMACCESS(3) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" @@ -1341,7 +1326,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us MEMACCESS(2) "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs - "subs %4, %4, #16 \n" // 16 pixels + "subs %w4, %w4, #16 \n" // 16 pixels MEMACCESS(3) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" @@ -1362,7 +1347,7 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { "1: \n" MEMACCESS(0) "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTORGB565 MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. @@ -1376,6 +1361,31 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { } #endif // HAS_ARGBTORGB565ROW_NEON +#ifdef HAS_ARGBTORGB565DITHERROW_NEON +void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int width) { + asm volatile ( + "dup v1.4s, %w2 \n" // dither4 + "1: \n" + MEMACCESS(1) + "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 pixels + "subs %w3, %w3, #8 \n" // 8 processed per loop. + "uqadd v20.8b, v20.8b, v1.8b \n" + "uqadd v21.8b, v21.8b, v1.8b \n" + "uqadd v22.8b, v22.8b, v1.8b \n" + ARGBTORGB565 + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(dst_rgb) // %0 + : "r"(src_argb), // %1 + "r"(dither4), // %2 + "r"(width) // %3 + : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" + ); +} +#endif // HAS_ARGBTORGB565ROW_NEON + #ifdef HAS_ARGBTOARGB1555ROW_NEON void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, int pix) { @@ -1383,7 +1393,7 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, "1: \n" MEMACCESS(0) "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB1555. @@ -1405,7 +1415,7 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "1: \n" MEMACCESS(0) "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 pixels - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 MEMACCESS(1) "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels ARGB4444. @@ -1429,7 +1439,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R @@ -1456,7 +1466,7 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R @@ -1487,7 +1497,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R @@ -1531,7 +1541,7 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. - "subs %3, %3, #16 \n" // 16 processed per loop. + "subs %w3, %w3, #16 \n" // 16 processed per loop. "mul v3.8h, v0.8h, v20.8h \n" // B "mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v2.8h, v22.8h \n" // R @@ -1587,7 +1597,7 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %3, %3, #32 \n" // 32 processed per loop. + "subs %w3, %w3, #32 \n" // 32 processed per loop. "mul v3.8h, v0.8h, v20.8h \n" // B "mls v3.8h, v1.8h, v21.8h \n" // G "mls v3.8h, v2.8h, v22.8h \n" // R @@ -1653,7 +1663,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1700,7 +1710,7 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1741,7 +1751,7 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "urshr v1.8h, v3.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1782,7 +1792,7 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "urshr v2.8h, v2.8h, #1 \n" "urshr v1.8h, v1.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v2.8h, v1.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1823,7 +1833,7 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1864,7 +1874,7 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "urshr v1.8h, v1.8h, #1 \n" "urshr v2.8h, v2.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. 
+ "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v0.8h, v1.8h, v2.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1905,7 +1915,7 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "urshr v1.8h, v1.8h, #1 \n" "urshr v0.8h, v0.8h, #1 \n" - "subs %4, %4, #16 \n" // 32 processed per loop. + "subs %w4, %w4, #16 \n" // 32 processed per loop. RGBTOUV(v2.8h, v1.8h, v0.8h) MEMACCESS(2) "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. @@ -1971,7 +1981,7 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "urshr v5.8h, v18.8h, #1 \n" "urshr v6.8h, v20.8h, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "mul v16.8h, v4.8h, v22.8h \n" // B "mls v16.8h, v5.8h, v23.8h \n" // G "mls v16.8h, v6.8h, v24.8h \n" // R @@ -2042,7 +2052,7 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "urshr v5.8h, v17.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "mul v2.8h, v4.8h, v20.8h \n" // B "mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v6.8h, v22.8h \n" // R @@ -2113,7 +2123,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "urshr v5.8h, v17.8h, #1 \n" "urshr v6.8h, v18.8h, #1 \n" - "subs %4, %4, #16 \n" // 16 processed per loop. + "subs %w4, %w4, #16 \n" // 16 processed per loop. "mul v2.8h, v4.8h, v20.8h \n" // B "mls v2.8h, v5.8h, v21.8h \n" // G "mls v2.8h, v6.8h, v22.8h \n" // R @@ -2153,7 +2163,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B "umlal v3.8h, v1.8b, v25.8b \n" // G @@ -2183,7 +2193,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B "umlal v3.8h, v1.8b, v5.8b \n" // G @@ -2212,7 +2222,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B "umlal v3.8h, v1.8b, v25.8b \n" // G @@ -2241,7 +2251,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B @@ -2269,7 +2279,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v16.8h, v0.8b, v4.8b \n" // R "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B @@ -2297,7 +2307,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R @@ -2325,7 +2335,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R @@ -2353,7 +2363,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "1: \n" MEMACCESS(0) "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R @@ -2380,13 +2390,13 @@ void InterpolateRow_NEON(uint8* dst_ptr, int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_ptr + src_stride; asm volatile ( - "cmp %4, #0 \n" + "cmp %w4, #0 \n" "b.eq 100f \n" - "cmp %4, #64 \n" + "cmp %w4, #64 \n" "b.eq 75f \n" - "cmp %4, #128 \n" + "cmp %w4, #128 \n" "b.eq 50f \n" - "cmp %4, #192 \n" + "cmp %w4, #192 \n" "b.eq 25f \n" "dup v5.16b, %w4 \n" @@ -2397,7 +2407,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" "umull2 v3.8h, v0.16b, v4.16b \n" "umlal v2.8h, v1.8b, v5.8b \n" @@ -2415,7 +2425,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) @@ -2429,7 +2439,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" @@ -2442,7 +2452,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "ld1 {v1.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v0.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) @@ -2454,7 +2464,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, "100: \n" MEMACCESS(1) "ld1 {v0.16b}, [%1], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -2477,7 +2487,7 @@ void InterpolateRow_NEON(uint8* dst_ptr, void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( - "subs %3, %3, #8 \n" + "subs %w3, %w3, #8 \n" "b.lt 89f \n" // Blend 8 pixels. "8: \n" @@ -2485,7 +2495,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 pixels MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 pixels - "subs %3, %3, #8 \n" // 8 processed per loop. 
+ "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a @@ -2504,7 +2514,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "b.ge 8b \n" "89: \n" - "adds %3, %3, #8-1 \n" + "adds %w3, %w3, #8-1 \n" "b.lt 99f \n" // Blend 1 pixels. @@ -2513,7 +2523,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "ld4 {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n" // load 1 pixel ARGB0. MEMACCESS(1) "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel ARGB1. - "subs %3, %3, #1 \n" // 1 processed per loop. + "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a "umull v17.8h, v5.8b, v3.8b \n" // dg * a "umull v18.8h, v6.8b, v3.8b \n" // dr * a @@ -2552,7 +2562,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a @@ -2586,7 +2596,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 pixels of ARGB. - "subs %1, %1, #8 \n" // 8 processed per loop. + "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" @@ -2630,7 +2640,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "1: \n" MEMACCESS(0) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" @@ -2667,7 +2677,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R @@ -2706,7 +2716,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "subs %1, %1, #8 \n" // 8 processed per loop. + "subs %w1, %w1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R @@ -2746,7 +2756,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "1: \n" MEMACCESS(0) "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 pixels. - "subs %2, %2, #8 \n" // 8 processed per loop. + "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 255) 16 bit "uxtl v17.8h, v17.8b \n" // g "uxtl v18.8h, v18.8b \n" // r @@ -2808,7 +2818,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"umull v0.8h, v0.8b, v4.8b \n" // multiply B "umull v1.8h, v1.8b, v5.8b \n" // multiply G "umull v2.8h, v2.8b, v6.8b \n" // multiply R @@ -2842,7 +2852,7 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" "uqadd v1.8b, v1.8b, v5.8b \n" "uqadd v2.8b, v2.8b, v6.8b \n" @@ -2872,7 +2882,7 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB pixels. MEMACCESS(1) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" "uqsub v1.8b, v1.8b, v5.8b \n" "uqsub v2.8b, v2.8b, v6.8b \n" @@ -2907,7 +2917,7 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add "orr v1.8b, v0.8b, v0.8b \n" "orr v2.8b, v0.8b, v0.8b \n" @@ -2935,7 +2945,7 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. MEMACCESS(1) "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "subs %3, %3, #16 \n" // 16 processed per loop. + "subs %w3, %w3, #16 \n" // 16 processed per loop. "uqadd v0.16b, v0.16b, v1.16b \n" // add MEMACCESS(2) "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. @@ -2966,7 +2976,7 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. MEMACCESS(1) "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"uqadd v1.8b, v0.8b, v2.8b \n" // add MEMACCESS(2) "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB pixels @@ -3006,7 +3016,7 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "ld1 {v2.8b}, [%2],%5 \n" // bottom MEMACCESS(2) "ld1 {v3.8b}, [%2],%6 \n" - "subs %4, %4, #8 \n" // 8 pixels + "subs %w4, %w4, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3019,8 +3029,8 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(src_y2), // %2 "+r"(dst_sobelx), // %3 "+r"(width) // %4 - : "r"(2), // %5 - "r"(6) // %6 + : "r"(2LL), // %5 + "r"(6LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } @@ -3051,7 +3061,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "ld1 {v2.8b}, [%0],%5 \n" // right MEMACCESS(1) "ld1 {v3.8b}, [%1],%5 \n" - "subs %3, %3, #8 \n" // 8 pixels + "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3063,8 +3073,8 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, "+r"(src_y1), // %1 "+r"(dst_sobely), // %2 "+r"(width) // %3 - : "r"(1), // %4 - "r"(6) // %5 + : "r"(1LL), // %4 + "r"(6LL) // %5 : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc index 6e9d04c..71be268 100644 --- a/third_party/libyuv/source/row_win.cc +++ b/third_party/libyuv/source/row_win.cc @@ -10,7 +10,8 @@ #include "libyuv/row.h" -#if defined (_M_X64) && !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_X64) && \ + defined(_MSC_VER) && !defined(__clang__) #include #include // For _mm_maddubs_epi16 #endif @@ -21,24 +22,8 @@ extern "C" { #endif // This module is for Visual C. -#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \ - (defined(_M_IX86) || defined(_M_X64)) - -// YUV to RGB conversion constants. -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ - -// U and V contributions to R,G,B. -#define UB -128 /* -min(128, round(2.018 * 64)) */ -#define UG 25 /* -round(-0.391 * 64) */ -#define VG 52 /* -round(-0.813 * 64) */ -#define VR -102 /* -round(1.596 * 64) */ - -// Bias values to subtract 16 from Y and 128 from U and V. -#define BB (UB * 128 - YGB) -#define BG (UG * 128 + VG * 128 - YGB) -#define BR (VR * 128 - YGB) +#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) && \ + defined(_MSC_VER) && !defined(__clang__) struct YuvConstants { lvec8 kUVToB; // 0 @@ -50,6 +35,27 @@ struct YuvConstants { lvec16 kYToRgb; // 192 }; +// BT.601 YUV to RGB reference +// R = (Y - 16) * 1.164 - V * -1.596 +// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813 +// B = (Y - 16) * 1.164 - U * -2.018 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ + +// U and V contributions to R,G,B. +#define UB -128 /* max(-128, round(-2.018 * 64)) */ +#define UG 25 /* round(0.391 * 64) */ +#define VG 52 /* round(0.813 * 64) */ +#define VR -102 /* round(-1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. 
+#define BB (UB * 128 + YGB) +#define BG (UG * 128 + VG * 128 + YGB) +#define BR (VR * 128 + YGB) + // BT601 constants for YUV to RGB. static YuvConstants SIMD_ALIGNED(kYuvConstants) = { { UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -78,10 +84,70 @@ static YuvConstants SIMD_ALIGNED(kYvuConstants) = { { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG } }; +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +// JPEG YUV to RGB reference +// * R = Y - V * -1.40200 +// * G = Y - U * 0.34414 - V * 0.71414 +// * B = Y - U * -1.77200 + +// Y contribution to R,G,B. Scale and bias. +// TODO(fbarchard): Consider moving constants into a common header. +#define YGJ 16320 /* round(1.000 * 64 * 256 * 256 / 257) */ +#define YGBJ 32 /* 64 / 2 */ + +// U and V contributions to R,G,B. +#define UBJ -113 /* round(-1.77200 * 64) */ +#define UGJ 22 /* round(0.34414 * 64) */ +#define VGJ 46 /* round(0.71414 * 64) */ +#define VRJ -90 /* round(-1.40200 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BBJ (UBJ * 128 + YGBJ) +#define BGJ (UGJ * 128 + VGJ * 128 + YGBJ) +#define BRJ (VRJ * 128 + YGBJ) + +// JPEG constants for YUV to RGB. +static YuvConstants SIMD_ALIGNED(kYuvJConstants) = { + { UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, + UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0, UBJ, 0 }, + { UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, + UGJ, VGJ, UGJ, VGJ, UGJ, VGJ, UGJ, VGJ }, + { 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, + 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ, 0, VRJ }, + { BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, + BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ, BBJ }, + { BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, + BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ, BGJ }, + { BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, + BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ, BRJ }, + { YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, + YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ, YGJ } +}; + +#undef YGJ +#undef YGBJ +#undef UBJ +#undef UGJ +#undef VGJ +#undef VRJ +#undef BBJ +#undef BGJ +#undef BRJ + // 64 bit #if defined(_M_X64) - -__declspec(align(16)) +#if defined(HAS_I422TOARGBROW_SSSE3) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -131,10 +197,9 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, width -= 8; } } - +#endif // 32 bit #else // defined(_M_X64) - #ifdef HAS_ARGBTOYROW_SSSE3 // Constants for ARGB. @@ -257,8 +322,8 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = { }; // Duplicates gray value 3 times and fills in alpha opaque. -__declspec(naked) __declspec(align(16)) -void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { +__declspec(naked) +void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_y mov edx, [esp + 8] // dst_argb @@ -284,7 +349,39 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } } -__declspec(naked) __declspec(align(16)) +#ifdef HAS_J400TOARGBROW_AVX2 +// Duplicates gray value 3 times and fills in alpha opaque. 
+__declspec(naked) +void J400ToARGBRow_AVX2(const uint8* src_y, uint8* dst_argb, int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + convertloop: + vmovdqu xmm0, [eax] + lea eax, [eax + 16] + vpermq ymm0, ymm0, 0xd8 + vpunpcklbw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + vpunpckhwd ymm1, ymm0, ymm0 + vpunpcklwd ymm0, ymm0, ymm0 + vpor ymm0, ymm0, ymm5 + vpor ymm1, ymm1, ymm5 + vmovdqu [edx], ymm0 + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_J400TOARGBROW_AVX2 + +__declspec(naked) void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { __asm { mov eax, [esp + 4] // src_rgb24 @@ -322,7 +419,7 @@ void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) { __asm { @@ -368,7 +465,7 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, // v * (256 + 8) // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 // 20 instructions. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, int pix) { __asm { @@ -417,8 +514,155 @@ void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, } } +#ifdef HAS_RGB565TOARGBROW_AVX2 +// pmul method to replicate bits. +// Math to replicate bits: +// (v << 8) | (v << 3) +// v * 256 + v * 8 +// v * (256 + 8) +// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 +__declspec(naked) +void RGB565ToARGBRow_AVX2(const uint8* src_rgb565, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x07e007e0 for Green + vpsllw ymm4, ymm4, 10 + vpsrlw ymm4, ymm4, 5 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_rgb565 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgr565 + vpand ymm1, ymm0, ymm3 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpand ymm0, ymm0, ymm4 // G in middle 6 bits + vpmulhuw ymm0, ymm0, ymm6 // << 5 * (256 + 4) + vpor ymm0, ymm0, ymm7 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 4 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 4 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_RGB565TOARGBROW_AVX2 + +#ifdef HAS_ARGB1555TOARGBROW_AVX2 +__declspec(naked) +void ARGB1555ToARGBRow_AVX2(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x01080108 // generate multiplier to repeat 5 bits + vmovd xmm5, eax + vbroadcastss ymm5, xmm5 + mov eax, 0x42004200 // multiplier 
shift by 6 and then repeat 5 bits + movd xmm6, eax + vbroadcastss ymm6, xmm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0xf800f800 for Red + vpsllw ymm3, ymm3, 11 + vpsrlw ymm4, ymm3, 6 // generate mask 0x03e003e0 for Green + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xff00ff00 for Alpha + vpsllw ymm7, ymm7, 8 + + mov eax, [esp + 4] // src_argb1555 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of 1555 + vpsllw ymm1, ymm0, 1 // R in upper 5 bits + vpsllw ymm2, ymm0, 11 // B in upper 5 bits + vpand ymm1, ymm1, ymm3 + vpmulhuw ymm2, ymm2, ymm5 // * (256 + 8) + vpmulhuw ymm1, ymm1, ymm5 // * (256 + 8) + vpsllw ymm1, ymm1, 8 + vpor ymm1, ymm1, ymm2 // RB + vpsraw ymm2, ymm0, 8 // A + vpand ymm0, ymm0, ymm4 // G in middle 5 bits + vpmulhuw ymm0, ymm0, ymm6 // << 6 * (256 + 8) + vpand ymm2, ymm2, ymm7 + vpor ymm0, ymm0, ymm2 // AG + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm1, ymm1, 0xd8 + vpunpckhbw ymm2, ymm1, ymm0 + vpunpcklbw ymm1, ymm1, ymm0 + vmovdqu [eax * 2 + edx], ymm1 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm2 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB1555TOARGBROW_AVX2 + +#ifdef HAS_ARGB4444TOARGBROW_AVX2 +__declspec(naked) +void ARGB4444ToARGBRow_AVX2(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + __asm { + mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f + vmovd xmm4, eax + vbroadcastss ymm4, xmm4 + vpslld ymm5, ymm4, 4 // 0xf0f0f0f0 for high nibbles + mov eax, [esp + 4] // src_argb4444 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + sub edx, eax + sub edx, eax + + convertloop: + vmovdqu ymm0, [eax] // fetch 16 pixels of bgra4444 + vpand ymm2, ymm0, ymm5 // mask high nibbles + vpand ymm0, ymm0, ymm4 // mask low nibbles + vpsrlw ymm3, ymm2, 4 + vpsllw ymm1, ymm0, 4 + vpor ymm2, ymm2, ymm3 + vpor ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // mutate for unpack + vpermq ymm2, ymm2, 0xd8 + vpunpckhbw ymm1, ymm0, ymm2 + vpunpcklbw ymm0, ymm0, ymm2 + vmovdqu [eax * 2 + edx], ymm0 // store 8 pixels of ARGB + vmovdqu [eax * 2 + edx + 32], ymm1 // store next 8 pixels of ARGB + lea eax, [eax + 32] + sub ecx, 16 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGB4444TOARGBROW_AVX2 + // 24 instructions -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, int pix) { __asm { @@ -471,7 +715,7 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, } // 18 instructions. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, int pix) { __asm { @@ -509,7 +753,7 @@ void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -547,7 +791,7 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -585,7 +829,8 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -__declspec(naked) __declspec(align(16)) +// 4 pixels +__declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -622,8 +867,97 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } +// 8 pixels +__declspec(naked) +void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + movd xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + punpcklbw xmm6, xmm6 // make dither 16 bytes + movdqa xmm7, xmm6 + punpcklwd xmm6, xmm6 + punpckhwd xmm7, xmm7 + pcmpeqb xmm3, xmm3 // generate mask 0x0000001f + psrld xmm3, 27 + pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 + psrld xmm4, 26 + pslld xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 + pslld xmm5, 11 + + convertloop: + movdqu xmm0, [eax] // fetch 4 pixels of argb + paddusb xmm0, xmm6 // add dither + movdqa xmm1, xmm0 // B + movdqa xmm2, xmm0 // G + pslld xmm0, 8 // R + psrld xmm1, 3 // B + psrld xmm2, 5 // G + psrad xmm0, 16 // R + pand xmm1, xmm3 // B + pand xmm2, xmm4 // G + pand xmm0, xmm5 // R + por xmm1, xmm2 // BG + por xmm0, xmm1 // BGR + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + jg convertloop + ret + } +} + +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +__declspec(naked) +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint32 dither4, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + vbroadcastss xmm6, [esp + 12] // dither4 + mov ecx, [esp + 16] // pix + vpunpcklbw xmm6, xmm6, xmm6 // make dither 32 bytes + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 // add dither + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpsrld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackusdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + // TODO(fbarchard): Improve sign extension/packing. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -664,7 +998,7 @@ void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -694,7 +1028,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -705,21 +1039,19 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 vpsrld ymm4, ymm4, 26 vpslld ymm4, ymm4, 5 - vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 - vpslld ymm5, ymm5, 11 + vpslld ymm5, ymm3, 11 // generate mask 0x0000f800 convertloop: vmovdqu ymm0, [eax] // fetch 8 pixels of argb vpsrld ymm2, ymm0, 5 // G vpsrld ymm1, ymm0, 3 // B - vpslld ymm0, ymm0, 8 // R + vpsrld ymm0, ymm0, 8 // R vpand ymm2, ymm2, ymm4 // G vpand ymm1, ymm1, ymm3 // B - vpsrad ymm0, ymm0, 16 // R vpand ymm0, ymm0, ymm5 // R vpor ymm1, ymm1, ymm2 // BG vpor ymm0, ymm0, ymm1 // BGR - vpackssdw ymm0, ymm0, ymm0 + vpackusdw ymm0, ymm0, ymm0 vpermq ymm0, ymm0, 0xd8 lea eax, [eax + 32] vmovdqu [edx], xmm0 // store 8 pixels of RGB565 @@ -733,7 +1065,7 @@ void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTORGB565ROW_AVX2 #ifdef HAS_ARGBTOARGB1555ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -773,7 +1105,7 @@ void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTOARGB1555ROW_AVX2 #ifdef HAS_ARGBTOARGB4444ROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { mov eax, [esp + 4] // src_argb @@ -804,7 +1136,7 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { #endif // HAS_ARGBTOARGB4444ROW_AVX2 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -839,7 +1171,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { // Convert 16 ARGB pixels (64 bytes) to 16 YJ values. // Same as ARGBToYRow but different coefficients, no add 16, but do rounding. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -880,7 +1212,7 @@ static const lvec32 kPermdARGBToY_AVX = { }; // Convert 32 ARGB pixels (128 bytes) to 32 Y values. -__declspec(naked) __declspec(align(32)) +__declspec(naked) void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -917,9 +1249,9 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { } #endif // HAS_ARGBTOYROW_AVX2 -#ifdef HAS_ARGBTOYROW_AVX2 +#ifdef HAS_ARGBTOYJROW_AVX2 // Convert 32 ARGB pixels (128 bytes) to 32 Y values. 
-__declspec(naked) __declspec(align(32)) +__declspec(naked) void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -958,7 +1290,7 @@ void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { } #endif // HAS_ARGBTOYJROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -991,7 +1323,7 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1024,7 +1356,7 @@ void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -1057,7 +1389,7 @@ void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1127,7 +1459,7 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1199,7 +1531,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #ifdef HAS_ARGBTOUVROW_AVX2 -__declspec(naked) __declspec(align(32)) +__declspec(naked) void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1264,7 +1596,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUV444Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1321,7 +1653,7 @@ void ARGBToUV444Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBToUV422Row_SSSE3(const uint8* src_argb0, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1379,7 +1711,7 @@ void ARGBToUV422Row_SSSE3(const uint8* src_argb0, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1449,7 +1781,7 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1519,7 +1851,7 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -1590,6 +1922,16 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 +// Read 16 UV from 444 +#define READYUV444_AVX2 __asm { \ + __asm vmovdqu xmm0, [esi] /* U */ /* NOLINT */ \ + __asm vmovdqu xmm1, [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 16] \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpermq ymm1, ymm1, 0xd8 \ + __asm 
vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + } + // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 __asm { \ __asm vmovq xmm0, qword ptr [esi] /* U */ /* NOLINT */ \ @@ -1600,6 +1942,17 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ } +// Read 4 UV from 411, upsample to 16 UV. +#define READYUV411_AVX2 __asm { \ + __asm vmovd xmm0, dword ptr [esi] /* U */ /* NOLINT */ \ + __asm vmovd xmm1, dword ptr [esi + edi] /* V */ /* NOLINT */ \ + __asm lea esi, [esi + 4] \ + __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ + __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ + __asm vpermq ymm0, ymm0, 0xd8 \ + __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ + } + // Read 8 UV from NV12, upsample to 16 UV. #define READNV12_AVX2 __asm { \ __asm vmovdqu xmm0, [esi] /* UV */ \ @@ -1646,15 +1999,15 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm2, ymm2, 0xd8 \ __asm vpunpcklwd ymm1, ymm0, ymm2 /* BGRA first 8 pixels */ \ __asm vpunpckhwd ymm0, ymm0, ymm2 /* BGRA next 8 pixels */ \ - __asm vmovdqu [edx], ymm1 \ - __asm vmovdqu [edx + 32], ymm0 \ + __asm vmovdqu 0[edx], ymm1 \ + __asm vmovdqu 32[edx], ymm0 \ __asm lea edx, [edx + 64] \ } #ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToARGBRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1687,10 +2040,118 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, } #endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_J422TOARGBROW_AVX2 +// 16 pixels +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void J422ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV422_AVX2 + YUVTORGB_AVX2(kYuvJConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_J422TOARGBROW_AVX2 + +#ifdef HAS_I444TOARGBROW_AVX2 +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +__declspec(naked) +void I444ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV444_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I444TOARGBROW_AVX2 + +#ifdef HAS_I411TOARGBROW_AVX2 +// 16 pixels +// 4 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). 
+__declspec(naked) +void I411ToARGBRow_AVX2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha + + convertloop: + READYUV411_AVX2 + YUVTORGB_AVX2(kYuvConstants) + STOREARGB_AVX2 + + sub ecx, 16 + jg convertloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_I411TOARGBROW_AVX2 + #ifdef HAS_NV12TOARGBROW_AVX2 // 16 pixels. // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV12ToARGBRow_AVX2(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -1712,6 +2173,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, jg convertloop pop esi + vzeroupper ret } } @@ -1720,7 +2182,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, #ifdef HAS_NV21TOARGBROW_AVX2 // 16 pixels. // 8 VU values upsampled to 16 VU, mixed with 16 Y producing 16 ARGB (64 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV21ToARGBRow_AVX2(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -1742,6 +2204,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, jg convertloop pop esi + vzeroupper ret } } @@ -1751,7 +2214,7 @@ void NV21ToARGBRow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToBGRARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1797,7 +2260,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGBARow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1843,7 +2306,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToABGRRow_AVX2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1914,7 +2377,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea esi, [esi + 2] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ } // Read 4 UV from NV12, upsample to 8 UV. 
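For orientation amid the 4:1:1 macros and rows above: I411 carries one (U, V) pair per 4 horizontal luma samples, so READYUV411 reads only 4 U and 4 V bytes and doubles them three times (bytes, then words, then dwords) to produce 16 interleaved UV pairs for 16 output pixels. A minimal scalar model of that expansion, with an invented helper name:

  #include <stdint.h>

  // Scalar sketch of the 4:1:1 chroma upsample done by READYUV411 and
  // READYUV411_AVX2: every run of 4 pixels shares one chroma sample.
  static void Expand411ToUVPairs(const uint8_t* u_buf, const uint8_t* v_buf,
                                 uint8_t* uv /* 2 * width bytes */,
                                 int width) {
    int x;
    for (x = 0; x < width; ++x) {
      uv[2 * x + 0] = u_buf[x >> 2];  // U repeated for pixels 4n..4n+3
      uv[2 * x + 1] = v_buf[x >> 2];  // V repeated for pixels 4n..4n+3
    }
  }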
@@ -1963,8 +2426,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm1, xmm0 \ __asm punpcklwd xmm0, xmm2 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm1, xmm2 /* BGRA next 4 pixels */ \ - __asm movdqu [edx], xmm0 \ - __asm movdqu [edx + 16], xmm1 \ + __asm movdqu 0[edx], xmm0 \ + __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32] \ } @@ -1977,8 +2440,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* BGRA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* BGRA next 4 pixels */ \ - __asm movdqu [edx], xmm5 \ - __asm movdqu [edx + 16], xmm0 \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32] \ } @@ -1990,8 +2453,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm1, xmm2 \ __asm punpcklwd xmm2, xmm0 /* RGBA first 4 pixels */ \ __asm punpckhwd xmm1, xmm0 /* RGBA next 4 pixels */ \ - __asm movdqu [edx], xmm2 \ - __asm movdqu [edx + 16], xmm1 \ + __asm movdqu 0[edx], xmm2 \ + __asm movdqu 16[edx], xmm1 \ __asm lea edx, [edx + 32] \ } @@ -2004,8 +2467,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm0, xmm5 \ __asm punpcklwd xmm5, xmm1 /* RGBA first 4 pixels */ \ __asm punpckhwd xmm0, xmm1 /* RGBA next 4 pixels */ \ - __asm movdqu [edx], xmm5 \ - __asm movdqu [edx + 16], xmm0 \ + __asm movdqu 0[edx], xmm5 \ + __asm movdqu 16[edx], xmm0 \ __asm lea edx, [edx + 32] \ } @@ -2021,8 +2484,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24] \ } @@ -2038,8 +2501,8 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm pshufb xmm0, xmm5 /* Pack first 8 and last 4 bytes. */ \ __asm pshufb xmm1, xmm6 /* Pack first 12 bytes. */ \ __asm palignr xmm1, xmm0, 12 /* last 4 bytes of xmm0 + 12 xmm1 */ \ - __asm movq qword ptr [edx], xmm0 /* First 8 bytes */ \ - __asm movdqu [edx + 8], xmm1 /* Last 16 bytes */ \ + __asm movq qword ptr 0[edx], xmm0 /* First 8 bytes */ \ + __asm movdqu 8[edx], xmm1 /* Last 16 bytes */ \ __asm lea edx, [edx + 24] \ } @@ -2075,13 +2538,13 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm por xmm3, xmm2 /* BG */ \ __asm por xmm1, xmm3 /* BGR */ \ __asm packssdw xmm0, xmm1 \ - __asm movdqu [edx], xmm0 /* store 8 pixels of RGB565 */ \ + __asm movdqu 0[edx], xmm0 /* store 8 pixels of RGB565 */ \ __asm lea edx, [edx + 16] \ } // 8 pixels. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2114,7 +2577,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2148,7 +2611,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRAWRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2182,7 +2645,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, // 8 pixels // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB565 (16 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2221,7 +2684,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2253,9 +2716,43 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, } // 8 pixels. +// JPeg color space version of I422ToARGB +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). +__declspec(naked) +void J422ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_argb, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // argb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUV422 + YUVTORGB(kYuvJConstants) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels. // 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). // Similar to I420 but duplicate UV once more. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2290,7 +2787,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2318,7 +2815,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, // 8 pixels. // 4 VU values upsampled to 8 VU, mixed with 8 Y producing 8 ARGB (32 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void NV21ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* dst_argb, @@ -2344,7 +2841,7 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2374,7 +2871,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2405,7 +2902,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToRGBARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -2437,12 +2934,12 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, #endif // HAS_I422TOARGBROW_SSSE3 -#ifdef HAS_YTOARGBROW_SSE2 +#ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). 
-__declspec(naked) __declspec(align(16)) -void YToARGBRow_SSE2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) +void I400ToARGBRow_SSE2(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) movd xmm2, eax @@ -2482,15 +2979,15 @@ void YToARGBRow_SSE2(const uint8* y_buf, ret } } -#endif // HAS_YTOARGBROW_SSE2 +#endif // HAS_I400TOARGBROW_SSE2 -#ifdef HAS_YTOARGBROW_AVX2 +#ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -__declspec(naked) __declspec(align(16)) -void YToARGBRow_AVX2(const uint8* y_buf, - uint8* rgb_buf, - int width) { +__declspec(naked) +void I400ToARGBRow_AVX2(const uint8* y_buf, + uint8* rgb_buf, + int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) vmovd xmm2, eax @@ -2506,7 +3003,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, mov ecx, [esp + 12] // width convertloop: - // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164 + // Step 1: Scale Y contriportbution to 16 G values. G = (y - 16) * 1.164 vmovdqu xmm0, [eax] lea eax, [eax + 16] vpermq ymm0, ymm0, 0xd8 // vpunpcklbw mutates @@ -2533,7 +3030,7 @@ void YToARGBRow_AVX2(const uint8* y_buf, ret } } -#endif // HAS_YTOARGBROW_AVX2 +#endif // HAS_I400TOARGBROW_AVX2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. @@ -2542,7 +3039,7 @@ static const uvec8 kShuffleMirror = { }; // TODO(fbarchard): Replace lea with -16 offset. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2563,7 +3060,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2586,7 +3083,7 @@ void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_MIRRORROW_AVX2 #ifdef HAS_MIRRORROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2617,7 +3114,7 @@ static const uvec8 kShuffleMirrorUV = { 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width) { __asm { @@ -2647,7 +3144,7 @@ void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif // HAS_MIRRORROW_UV_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2674,7 +3171,7 @@ static const ulvec32 kARGBShuffleMirror_AVX2 = { 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2695,7 +3192,7 @@ void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBMIRRORROW_AVX2 #ifdef HAS_SPLITUVROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -2733,7 +3230,7 @@ void SplitUVRow_SSE2(const uint8* src_uv, 
uint8* dst_u, uint8* dst_v, int pix) { #endif // HAS_SPLITUVROW_SSE2 #ifdef HAS_SPLITUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { __asm { push edi @@ -2771,7 +3268,7 @@ void SplitUVRow_AVX2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #endif // HAS_SPLITUVROW_AVX2 #ifdef HAS_MERGEUVROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { __asm { @@ -2802,7 +3299,7 @@ void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #endif // HAS_MERGEUVROW_SSE2 #ifdef HAS_MERGEUVROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { __asm { @@ -2836,7 +3333,7 @@ void MergeUVRow_AVX2(const uint8* src_u, const uint8* src_v, uint8* dst_uv, #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -2859,7 +3356,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { #ifdef HAS_COPYROW_AVX // CopyRow copys 'count' bytes using a 32 byte load/store, 64 bytes at time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_AVX(const uint8* src, uint8* dst, int count) { __asm { mov eax, [esp + 4] // src @@ -2883,7 +3380,7 @@ void CopyRow_AVX(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_AVX // Multiple of 1. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { __asm { mov eax, esi @@ -2900,7 +3397,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int count) { #ifdef HAS_ARGBCOPYALPHAROW_SSE2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2936,7 +3433,7 @@ void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYALPHAROW_AVX2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -2965,7 +3462,7 @@ void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3003,7 +3500,7 @@ void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) { #ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2 // width in pixels -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { __asm { mov eax, [esp + 4] // src @@ -3035,7 +3532,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #ifdef HAS_SETROW_X86 // Write 'count' bytes using an 8 bit value repeated. // Count should be multiple of 4. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void SetRow_X86(uint8* dst, uint8 v8, int count) { __asm { movzx eax, byte ptr [esp + 8] // v8 @@ -3052,7 +3549,7 @@ void SetRow_X86(uint8* dst, uint8 v8, int count) { } // Write 'count' bytes using an 8 bit value repeated. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SetRow_ERMS(uint8* dst, uint8 v8, int count) { __asm { mov edx, edi @@ -3066,7 +3563,7 @@ void SetRow_ERMS(uint8* dst, uint8 v8, int count) { } // Write 'count' 32 bit values. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { __asm { mov edx, edi @@ -3081,7 +3578,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #endif // HAS_SETROW_X86 #ifdef HAS_YUY2TOYROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -3108,7 +3605,7 @@ void YUY2ToYRow_AVX2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3152,7 +3649,7 @@ void YUY2ToUVRow_AVX2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3191,7 +3688,7 @@ void YUY2ToUV422Row_AVX2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -3216,7 +3713,7 @@ void UYVYToYRow_AVX2(const uint8* src_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3260,7 +3757,7 @@ void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUV422Row_AVX2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3301,7 +3798,7 @@ void UYVYToUV422Row_AVX2(const uint8* src_uyvy, #endif // HAS_YUY2TOYROW_AVX2 #ifdef HAS_YUY2TOYROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { @@ -3326,7 +3823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3369,7 +3866,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3405,7 +3902,7 @@ void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { __asm { @@ -3428,7 +3925,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { @@ -3471,7 +3968,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void UYVYToUV422Row_SSE2(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, int pix) { __asm { 
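A layout reminder for the YUY2/UYVY rows above and the SSE2 variants that follow: both formats pack 2 image pixels into one 4-byte macro-pixel, YUY2 as Y0 U0 Y1 V0 (luma at even bytes) and UYVY as U0 Y0 V0 Y1 (luma at odd bytes). The UV422 rows therefore emit one U and one V per 2 pixels; a scalar sketch with an invented name, equivalent to what the byte masking (pcmpeqb/psrlw/pand) selects 16 pixels at a time:

  #include <stdint.h>

  // Illustrative scalar version of the UYVYToUV422 rows: each 4-byte
  // macro-pixel contributes one U byte and one V byte.
  static void UYVYToUV422Row_Sketch(const uint8_t* src_uyvy,
                                    uint8_t* dst_u, uint8_t* dst_v,
                                    int width) {
    int x;
    for (x = 0; x < width; x += 2) {
      *dst_u++ = src_uyvy[0];  // U0 sits at byte 0 of the macro-pixel
      *dst_v++ = src_uyvy[2];  // V0 sits at byte 2
      src_uyvy += 4;           // advance 2 image pixels
    }
  }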
@@ -3510,7 +4007,7 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy, #ifdef HAS_ARGBBLENDROW_SSE2 // Blend 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -3527,43 +4024,8 @@ void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, psllw xmm5, 8 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - - sub ecx, 1 - je convertloop1 // only 1 pixel? - jl convertloop1b - - // 1 pixel loop until destination pointer is aligned. - alignloop1: - test edx, 15 // aligned? - je alignloop1b - movd xmm3, [eax] - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - psrlw xmm3, 8 // alpha - pshufhw xmm3, xmm3, 0F5h // 8 alpha words - pshuflw xmm3, xmm3, 0F5h - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge alignloop1 - - alignloop1b: - add ecx, 1 - 4 - jl convertloop4b + sub ecx, 4 + jl convertloop4b // less than 4 pixels? // 4 pixel loop. convertloop4: @@ -3644,7 +4106,7 @@ static const uvec8 kShuffleAlpha = { // pshufb xmm3, kShuffleAlpha // alpha // Blend 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -3661,41 +4123,8 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, psllw xmm5, 8 pcmpeqb xmm4, xmm4 // generate mask 0xff000000 pslld xmm4, 24 - - sub ecx, 1 - je convertloop1 // only 1 pixel? - jl convertloop1b - - // 1 pixel loop until destination pointer is aligned. - alignloop1: - test edx, 15 // aligned? - je alignloop1b - movd xmm3, [eax] - lea eax, [eax + 4] - movdqa xmm0, xmm3 // src argb - pxor xmm3, xmm4 // ~alpha - movd xmm2, [esi] // _r_b - pshufb xmm3, kShuffleAlpha // alpha - pand xmm2, xmm6 // _r_b - paddw xmm3, xmm7 // 256 - alpha - pmullw xmm2, xmm3 // _r_b * alpha - movd xmm1, [esi] // _a_g - lea esi, [esi + 4] - psrlw xmm1, 8 // _a_g - por xmm0, xmm4 // set alpha to 255 - pmullw xmm1, xmm3 // _a_g * alpha - psrlw xmm2, 8 // _r_b convert to 8 bits again - paddusb xmm0, xmm2 // + src argb - pand xmm1, xmm5 // a_g_ convert to 8 bits again - paddusb xmm0, xmm1 // + src argb - movd [edx], xmm0 - lea edx, [edx + 4] - sub ecx, 1 - jge alignloop1 - - alignloop1b: - add ecx, 1 - 4 - jl convertloop4b + sub ecx, 4 + jl convertloop4b // less than 4 pixels? // 4 pixel loop. convertloop4: @@ -3760,7 +4189,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -3809,7 +4238,7 @@ static const uvec8 kShuffleAlpha1 = { 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -3853,7 +4282,7 @@ void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { static const uvec8 kShuffleAlpha_AVX2 = { 6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] // src_argb0 @@ -3890,7 +4319,7 @@ void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { #ifdef HAS_ARGBUNATTENUATEROW_SSE2 // Unattenuate 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -3944,7 +4373,7 @@ static const uvec8 kUnattenShuffleAlpha_AVX2 = { // TODO(fbarchard): Enable USE_GATHER for future hardware if faster. // USE_GATHER is not on by default, due to being a slow instruction. #ifdef USE_GATHER -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -3978,7 +4407,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, } } #else // USE_GATHER -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) { __asm { @@ -4045,7 +4474,7 @@ void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBGRAYROW_SSSE3 // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] /* src_argb */ @@ -4104,7 +4533,7 @@ static const vec8 kARGBToSepiaR = { }; // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { __asm { mov eax, [esp + 4] /* dst_argb */ @@ -4161,7 +4590,7 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { // Same as Sepia except matrix is provided. // TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R // and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { __asm { @@ -4222,7 +4651,7 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { __asm { @@ -4267,7 +4696,7 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, #ifdef HAS_ARGBSHADEROW_SSE2 // Shade 4 pixels at a time by specified value. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { __asm { @@ -4301,7 +4730,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBMULTIPLYROW_SSE2 // Multiply 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4340,7 +4769,7 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_SSE2 // Add 2 rows of ARGB pixels together, 4 pixels at a time. // TODO(fbarchard): Port this to posix, neon and other math functions. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4388,7 +4817,7 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_SSE2 // Subtract 2 rows of ARGB pixels together, 4 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4417,7 +4846,7 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBMULTIPLYROW_AVX2 // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4454,7 +4883,7 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBADDROW_AVX2 // Add 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4483,7 +4912,7 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, #ifdef HAS_ARGBSUBTRACTROW_AVX2 // Subtract 2 rows of ARGB pixels together, 8 pixels at a time. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { __asm { @@ -4515,7 +4944,7 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1, // -1 0 1 // -2 0 2 // -1 0 1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { __asm { @@ -4571,7 +5000,7 @@ void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1, // -1 -2 -1 // 0 0 0 // 1 2 1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { __asm { @@ -4624,7 +5053,7 @@ void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1, // R = Sobel // G = Sobel // B = Sobel -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { __asm { @@ -4671,7 +5100,7 @@ void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, #ifdef HAS_SOBELTOPLANEROW_SSE2 // Adds Sobel X and Sobel Y and stores Sobel into a plane. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { __asm { @@ -4704,7 +5133,7 @@ void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, // R = Sobel X // G = Sobel // B = Sobel Y -__declspec(naked) __declspec(align(16)) +__declspec(naked) void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { __asm { @@ -4991,7 +5420,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, #ifdef HAS_ARGBAFFINEROW_SSE2 // Copy ARGB pixels from source image with slope to a row of destination. -__declspec(naked) __declspec(align(16)) +__declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, uint8* dst_argb, const float* uv_dudv, int width) { @@ -5076,7 +5505,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, #ifdef HAS_INTERPOLATEROW_AVX2 // Bilinear filter 32x2 -> 32x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5173,7 +5602,7 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr, #endif // HAS_INTERPOLATEROW_AVX2 // Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5274,7 +5703,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, #ifdef HAS_INTERPOLATEROW_SSE2 // Bilinear filter 16x2 -> 16x1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -5380,38 +5809,8 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr, } #endif // HAS_INTERPOLATEROW_SSE2 -// Specialized ARGB to Bayer that just isolates G channel. -__declspec(naked) __declspec(align(16)) -void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) { - __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_bayer - // selector - mov ecx, [esp + 16] // pix - pcmpeqb xmm5, xmm5 // generate mask 0x000000ff - psrld xmm5, 24 - - wloop: - movdqu xmm0, [eax] - movdqu xmm1, [eax + 16] - lea eax, [eax + 32] - psrld xmm0, 8 // Move green to bottom. - psrld xmm1, 8 - pand xmm0, xmm5 - pand xmm1, xmm5 - packssdw xmm0, xmm1 - packuswb xmm0, xmm1 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - sub ecx, 8 - jg wloop - ret - } -} - // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. 
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -5437,7 +5836,7 @@ void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #ifdef HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -5465,7 +5864,7 @@ void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBSHUFFLEROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { __asm { @@ -5587,7 +5986,7 @@ void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb, // UYVY - Macro-pixel = 2 image pixels // U0Y0V0Y1 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToYUY2Row_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -5624,7 +6023,7 @@ void I422ToYUY2Row_SSE2(const uint8* src_y, } } -__declspec(naked) __declspec(align(16)) +__declspec(naked) void I422ToUYVYRow_SSE2(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -5662,7 +6061,7 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBPolynomialRow_SSE2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { @@ -5721,7 +6120,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #endif // HAS_ARGBPOLYNOMIALROW_SSE2 #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBPolynomialRow_AVX2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width) { @@ -5761,7 +6160,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, #ifdef HAS_ARGBCOLORTABLEROW_X86 // Tranform ARGB pixels with color table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { __asm { @@ -5795,7 +6194,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { __asm { push esi @@ -5826,7 +6225,7 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, uint32 lumacoeff) { @@ -5924,7 +6323,7 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // defined(_M_X64) -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || defined(_M_X64)) #ifdef __cplusplus } // extern "C" diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc index 482c5a6..0a01304 100644 --- a/third_party/libyuv/source/scale.cc +++ b/third_party/libyuv/source/scale.cc @@ -23,9 +23,6 @@ namespace libyuv { extern "C" { #endif -// Remove this macro if OVERREAD is safe. -#define AVOID_OVERREAD 1 - static __inline int Abs(int v) { return v >= 0 ? 
v : -v; } @@ -44,9 +41,8 @@ static void ScalePlaneDown2(int src_width, int src_height, int y; void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering == kFilterNone ? ScaleRowDown2_C : - (filtering == kFilterLinear ? ScaleRowDown2Linear_C : - ScaleRowDown2Box_C); + filtering == kFilterNone ? ScaleRowDown2_C : + (filtering == kFilterLinear ? ScaleRowDown2Linear_C : ScaleRowDown2Box_C); int row_stride = src_stride << 1; if (!filtering) { src_ptr += src_stride; // Point to odd rows. @@ -54,15 +50,39 @@ static void ScalePlaneDown2(int src_width, int src_height, } #if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_NEON : + ScaleRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_NEON : + (filtering == kFilterLinear ? ScaleRowDown2Linear_NEON : + ScaleRowDown2Box_NEON); + } } #endif #if defined(HAS_SCALEROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : - ScaleRowDown2Box_SSE2); + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_SSE2 : + ScaleRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_SSE2 : + ScaleRowDown2Box_SSE2); + } + } +#endif +#if defined(HAS_SCALEROWDOWN2_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_Any_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_Any_AVX2 : + ScaleRowDown2Box_Any_AVX2); + if (IS_ALIGNED(dst_width, 32)) { + ScaleRowDown2 = filtering == kFilterNone ? ScaleRowDown2_AVX2 : + (filtering == kFilterLinear ? ScaleRowDown2Linear_AVX2 : + ScaleRowDown2Box_AVX2); + } } #endif #if defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) @@ -154,13 +174,30 @@ static void ScalePlaneDown4(int src_width, int src_height, src_stride = 0; } #if defined(HAS_SCALEROWDOWN4_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_NEON : ScaleRowDown4_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; + } } #endif #if defined(HAS_SCALEROWDOWN4_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_SSE2 : ScaleRowDown4_Any_SSE2; + if (IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; + } + } +#endif +#if defined(HAS_SCALEROWDOWN4_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowDown4 = filtering ? + ScaleRowDown4Box_Any_AVX2 : ScaleRowDown4_Any_AVX2; + if (IS_ALIGNED(dst_width, 16)) { + ScaleRowDown4 = filtering ? 
ScaleRowDown4Box_AVX2 : ScaleRowDown4_AVX2; + } } #endif #if defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) @@ -249,24 +286,42 @@ static void ScalePlaneDown34(int src_width, int src_height, ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; } #if defined(HAS_SCALEROWDOWN34_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_NEON; - ScaleRowDown34_1 = ScaleRowDown34_NEON; + ScaleRowDown34_0 = ScaleRowDown34_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_Any_NEON; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_NEON; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_NEON; + ScaleRowDown34_1 = ScaleRowDown34_NEON; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; + } } } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { - ScaleRowDown34_0 = ScaleRowDown34_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_Any_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_Any_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_Any_SSSE3; + } + if (dst_width % 24 == 0) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_SSSE3; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; + } } } #endif @@ -422,23 +477,41 @@ static void ScalePlaneDown38(int src_width, int src_height, ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; } + #if defined(HAS_SCALEROWDOWN38_NEON) - if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { + if (TestCpuFlag(kCpuHasNEON)) { if (!filtering) { - ScaleRowDown38_3 = ScaleRowDown38_NEON; - ScaleRowDown38_2 = ScaleRowDown38_NEON; + ScaleRowDown38_3 = ScaleRowDown38_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_Any_NEON; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_NEON; + } + if (dst_width % 12 == 0) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_NEON; + ScaleRowDown38_2 = ScaleRowDown38_NEON; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; + } } } #endif #if defined(HAS_SCALEROWDOWN38_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && (dst_width % 24 == 0)) { + if (TestCpuFlag(kCpuHasSSSE3)) { if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_Any_SSSE3; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Box_Any_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_Any_SSSE3; + } + if (dst_width % 12 == 0 && !filtering) { ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; - } else { + } + if (dst_width % 6 == 0 && filtering) { ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } @@ -559,65 +632,7 @@ static void ScalePlaneDown38_16(int src_width, int src_height, } } -static __inline uint32 SumBox(int 
iboxwidth, int iboxheight, - ptrdiff_t src_stride, const uint8* src_ptr) { - uint32 sum = 0u; - int y; - assert(iboxwidth > 0); - assert(iboxheight > 0); - for (y = 0; y < iboxheight; ++y) { - int x; - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - src_ptr += src_stride; - } - return sum; -} - -static __inline uint32 SumBox_16(int iboxwidth, int iboxheight, - ptrdiff_t src_stride, const uint16* src_ptr) { - uint32 sum = 0u; - int y; - assert(iboxwidth > 0); - assert(iboxheight > 0); - for (y = 0; y < iboxheight; ++y) { - int x; - for (x = 0; x < iboxwidth; ++x) { - sum += src_ptr[x]; - } - src_ptr += src_stride; - } - return sum; -} - -static void ScalePlaneBoxRow_C(int dst_width, int boxheight, - int x, int dx, ptrdiff_t src_stride, - const uint8* src_ptr, uint8* dst_ptr) { - int i; - int boxwidth; - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = (x >> 16) - ix; - *dst_ptr++ = SumBox(boxwidth, boxheight, src_stride, src_ptr + ix) / - (boxwidth * boxheight); - } -} - -static void ScalePlaneBoxRow_16_C(int dst_width, int boxheight, - int x, int dx, ptrdiff_t src_stride, - const uint16* src_ptr, uint16* dst_ptr) { - int i; - int boxwidth; - for (i = 0; i < dst_width; ++i) { - int ix = x >> 16; - x += dx; - boxwidth = (x >> 16) - ix; - *dst_ptr++ = SumBox_16(boxwidth, boxheight, src_stride, src_ptr + ix) / - (boxwidth * boxheight); - } -} +#define MIN1(x) ((x) < 1 ? 1 : (x)) static __inline uint32 SumPixels(int iboxwidth, const uint16* src_ptr) { uint32 sum = 0u; @@ -643,15 +658,15 @@ static void ScaleAddCols2_C(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr) { int i; int scaletbl[2]; - int minboxwidth = (dx >> 16); + int minboxwidth = dx >> 16; int* scaleptr = scaletbl - minboxwidth; int boxwidth; - scaletbl[0] = 65536 / (minboxwidth * boxheight); - scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); for (i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; - boxwidth = (x >> 16) - ix; + boxwidth = MIN1((x >> 16) - ix); *dst_ptr++ = SumPixels(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; } } @@ -660,25 +675,36 @@ static void ScaleAddCols2_16_C(int dst_width, int boxheight, int x, int dx, const uint32* src_ptr, uint16* dst_ptr) { int i; int scaletbl[2]; - int minboxwidth = (dx >> 16); + int minboxwidth = dx >> 16; int* scaleptr = scaletbl - minboxwidth; int boxwidth; - scaletbl[0] = 65536 / (minboxwidth * boxheight); - scaletbl[1] = 65536 / ((minboxwidth + 1) * boxheight); + scaletbl[0] = 65536 / (MIN1(minboxwidth) * boxheight); + scaletbl[1] = 65536 / (MIN1(minboxwidth + 1) * boxheight); for (i = 0; i < dst_width; ++i) { int ix = x >> 16; x += dx; - boxwidth = (x >> 16) - ix; - *dst_ptr++ = SumPixels_16(boxwidth, src_ptr + ix) * - scaleptr[boxwidth] >> 16; + boxwidth = MIN1((x >> 16) - ix); + *dst_ptr++ = + SumPixels_16(boxwidth, src_ptr + ix) * scaleptr[boxwidth] >> 16; + } +} + +static void ScaleAddCols0_C(int dst_width, int boxheight, int x, int, + const uint16* src_ptr, uint8* dst_ptr) { + int scaleval = 65536 / boxheight; + int i; + src_ptr += (x >> 16); + for (i = 0; i < dst_width; ++i) { + *dst_ptr++ = src_ptr[i] * scaleval >> 16; } } static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr) { - int boxwidth = (dx >> 16); + int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; + x >>= 16; for 
(i = 0; i < dst_width; ++i) { *dst_ptr++ = SumPixels(boxwidth, src_ptr + x) * scaleval >> 16; x += boxwidth; @@ -687,7 +713,7 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, static void ScaleAddCols1_16_C(int dst_width, int boxheight, int x, int dx, const uint32* src_ptr, uint16* dst_ptr) { - int boxwidth = (dx >> 16); + int boxwidth = MIN1(dx >> 16); int scaleval = 65536 / (boxwidth * boxheight); int i; for (i = 0; i < dst_width; ++i) { @@ -707,7 +733,7 @@ static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint8* src_ptr, uint8* dst_ptr) { - int j; + int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -717,42 +743,37 @@ static void ScalePlaneBox(int src_width, int src_height, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, &dx, &dy); src_width = Abs(src_width); - // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. - if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { - uint8* dst = dst_ptr; - int j; - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint8* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = (y >> 16) - iy; - ScalePlaneBoxRow_C(dst_width, boxheight, - x, dx, src_stride, - src, dst); - dst += dst_stride; - } - return; - } { // Allocate a row buffer of uint16. align_buffer_64(row16, src_width * 2); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint16* src_ptr, uint8* dst_ptr) = - (dx & 0xffff) ? ScaleAddCols2_C: ScaleAddCols1_C; - void (*ScaleAddRows)(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) = ScaleAddRows_C; - -#if defined(HAS_SCALEADDROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) -#ifdef AVOID_OVERREAD - && IS_ALIGNED(src_width, 16) + (dx & 0xffff) ? ScaleAddCols2_C: + ((dx != 0x10000) ? 
ScaleAddCols1_C : ScaleAddCols0_C); + void (*ScaleAddRow)(const uint8* src_ptr, uint16* dst_ptr, int src_width) = + ScaleAddRow_C; +#if defined(HAS_SCALEADDROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleAddRow = ScaleAddRow_Any_SSE2; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_SSE2; + } + } #endif - ) { - ScaleAddRows = ScaleAddRows_SSE2; +#if defined(HAS_SCALEADDROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleAddRow = ScaleAddRow_Any_AVX2; + if (IS_ALIGNED(src_width, 32)) { + ScaleAddRow = ScaleAddRow_AVX2; + } + } +#endif +#if defined(HAS_SCALEADDROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleAddRow = ScaleAddRow_Any_NEON; + if (IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_NEON; + } } #endif @@ -761,14 +782,16 @@ static void ScalePlaneBox(int src_width, int src_height, int iy = y >> 16; const uint8* src = src_ptr + iy * src_stride; y += dy; - if (y > (src_height << 16)) { - y = (src_height << 16); + if (y > max_y) { + y = max_y; + } + boxheight = MIN1((y >> 16) - iy); + memset(row16, 0, src_width * 2); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint16 *)(row16), src_width); + src += src_stride; } - boxheight = (y >> 16) - iy; - ScaleAddRows(src, src_stride, (uint16*)(row16), - src_width, boxheight); - ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), - dst_ptr); + ScaleAddCols(dst_width, boxheight, x, dx, (uint16*)(row16), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row16); @@ -779,7 +802,7 @@ static void ScalePlaneBox_16(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, const uint16* src_ptr, uint16* dst_ptr) { - int j; + int j, k; // Initial source x/y coordinate and step values as 16.16 fixed point. int x = 0; int y = 0; @@ -789,42 +812,18 @@ static void ScalePlaneBox_16(int src_width, int src_height, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, &dx, &dy); src_width = Abs(src_width); - // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. - if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { - uint16* dst = dst_ptr; - int j; - for (j = 0; j < dst_height; ++j) { - int boxheight; - int iy = y >> 16; - const uint16* src = src_ptr + iy * src_stride; - y += dy; - if (y > max_y) { - y = max_y; - } - boxheight = (y >> 16) - iy; - ScalePlaneBoxRow_16_C(dst_width, boxheight, - x, dx, src_stride, - src, dst); - dst += dst_stride; - } - return; - } { // Allocate a row buffer of uint32. align_buffer_64(row32, src_width * 4); void (*ScaleAddCols)(int dst_width, int boxheight, int x, int dx, const uint32* src_ptr, uint16* dst_ptr) = (dx & 0xffff) ? 
ScaleAddCols2_16_C: ScaleAddCols1_16_C; - void (*ScaleAddRows)(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height) = ScaleAddRows_16_C; + void (*ScaleAddRow)(const uint16* src_ptr, uint32* dst_ptr, int src_width) = + ScaleAddRow_16_C; -#if defined(HAS_SCALEADDROWS_16_SSE2) - if (TestCpuFlag(kCpuHasSSE2) -#ifdef AVOID_OVERREAD - && IS_ALIGNED(src_width, 16) -#endif - ) { - ScaleAddRows = ScaleAddRows_16_SSE2; +#if defined(HAS_SCALEADDROW_16_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16)) { + ScaleAddRow = ScaleAddRow_16_SSE2; } #endif @@ -833,14 +832,16 @@ static void ScalePlaneBox_16(int src_width, int src_height, int iy = y >> 16; const uint16* src = src_ptr + iy * src_stride; y += dy; - if (y > (src_height << 16)) { - y = (src_height << 16); + if (y > max_y) { + y = max_y; } - boxheight = (y >> 16) - iy; - ScaleAddRows(src, src_stride, (uint32*)(row32), - src_width, boxheight); - ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), - dst_ptr); + boxheight = MIN1((y >> 16) - iy); + memset(row32, 0, src_width * 4); + for (k = 0; k < boxheight; ++k) { + ScaleAddRow(src, (uint32 *)(row32), src_width); + src += src_stride; + } + ScaleAddCols(dst_width, boxheight, x, dx, (uint32*)(row32), dst_ptr); dst_ptr += dst_stride; } free_aligned_buffer_64(row32); @@ -921,6 +922,14 @@ void ScalePlaneBilinearDown(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif if (y > max_y) { y = max_y; } @@ -1057,8 +1066,8 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_C; void (*ScaleFilterCols)(uint8* dst_ptr, const uint8* src_ptr, - int dst_width, int x, int dx) = - filtering ? ScaleFilterCols_C : ScaleCols_C; + int dst_width, int x, int dx) = + filtering ? ScaleFilterCols_C : ScaleCols_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); @@ -1112,6 +1121,14 @@ void ScalePlaneBilinearUp(int src_width, int src_height, ScaleFilterCols = ScaleFilterCols_SSSE3; } #endif +#if defined(HAS_SCALEFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON) && src_width < 32768) { + ScaleFilterCols = ScaleFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleFilterCols = ScaleFilterCols_NEON; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleFilterCols = ScaleColsUp2_C; #if defined(HAS_SCALECOLS_SSE2) @@ -1129,7 +1146,7 @@ void ScalePlaneBilinearUp(int src_width, int src_height, const uint8* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. - const int kRowSize = (dst_width + 15) & ~15; + const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 2); uint8* rowptr = row; @@ -1188,8 +1205,8 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = InterpolateRow_16_C; void (*ScaleFilterCols)(uint16* dst_ptr, const uint16* src_ptr, - int dst_width, int x, int dx) = - filtering ? ScaleFilterCols_16_C : ScaleCols_16_C; + int dst_width, int x, int dx) = + filtering ? 
ScaleFilterCols_16_C : ScaleCols_16_C; ScaleSlope(src_width, src_height, dst_width, dst_height, filtering, &x, &y, &dx, &dy); src_width = Abs(src_width); @@ -1260,7 +1277,7 @@ void ScalePlaneBilinearUp_16(int src_width, int src_height, const uint16* src = src_ptr + yi * src_stride; // Allocate 2 row buffers. - const int kRowSize = (dst_width + 15) & ~15; + const int kRowSize = (dst_width + 31) & ~31; align_buffer_64(row, kRowSize * 4); uint16* rowptr = (uint16*)row; @@ -1334,8 +1351,7 @@ static void ScalePlaneSimple(int src_width, int src_height, } for (i = 0; i < dst_height; ++i) { - ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, - dst_width, x, dx); + ScaleCols(dst_ptr, src_ptr + (y >> 16) * src_stride, dst_width, x, dx); dst_ptr += dst_stride; y += dy; } @@ -1385,8 +1401,7 @@ void ScalePlane(const uint8* src, int src_stride, enum FilterMode filtering) { // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, - filtering); + dst_width, dst_height, filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1402,9 +1417,9 @@ void ScalePlane(const uint8* src, int src_stride, CopyPlane(src, src_stride, dst, dst_stride, dst_width, dst_height); return; } - if (dst_width == src_width) { + if (dst_width == src_width && filtering != kFilterBox) { int dy = FixedDiv(src_height, dst_height); - // Arbitrary scale vertically, but unscaled vertically. + // Arbitrary scale vertically, but unscaled horizontally. ScalePlaneVertical(src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, @@ -1435,7 +1450,7 @@ void ScalePlane(const uint8* src, int src_stride, return; } if (4 * dst_width == src_width && 4 * dst_height == src_height && - filtering != kFilterBilinear) { + (filtering == kFilterBox || filtering == kFilterNone)) { // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); @@ -1469,8 +1484,7 @@ void ScalePlane_16(const uint16* src, int src_stride, enum FilterMode filtering) { // Simplify filtering when possible. filtering = ScaleFilterReduce(src_width, src_height, - dst_width, dst_height, - filtering); + dst_width, dst_height, filtering); // Negative height means invert the image. if (src_height < 0) { @@ -1563,6 +1577,7 @@ int I420Scale(const uint8* src_y, int src_stride_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } @@ -1594,6 +1609,7 @@ int I420Scale_16(const uint16* src_y, int src_stride_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); if (!src_y || !src_u || !src_v || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { return -1; } diff --git a/third_party/libyuv/source/scale_any.cc b/third_party/libyuv/source/scale_any.cc new file mode 100644 index 0000000..2f6a2c8 --- /dev/null +++ b/third_party/libyuv/source/scale_any.cc @@ -0,0 +1,200 @@ +/* + * Copyright 2015 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. 
An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +// Definition for ScaleFilterCols, ScaleARGBCols and ScaleARGBFilterCols +#define CANY(NAMEANY, TERP_SIMD, TERP_C, BPP, MASK) \ + void NAMEANY(uint8* dst_ptr, const uint8* src_ptr, \ + int dst_width, int x, int dx) { \ + int n = dst_width & ~MASK; \ + if (n > 0) { \ + TERP_SIMD(dst_ptr, src_ptr, n, x, dx); \ + } \ + TERP_C(dst_ptr + n * BPP, src_ptr, \ + dst_width & MASK, x + n * dx, dx); \ + } + +#ifdef HAS_SCALEFILTERCOLS_NEON +CANY(ScaleFilterCols_Any_NEON, ScaleFilterCols_NEON, ScaleFilterCols_C, 1, 7) +#endif +#ifdef HAS_SCALEARGBCOLS_NEON +CANY(ScaleARGBCols_Any_NEON, ScaleARGBCols_NEON, ScaleARGBCols_C, 4, 7) +#endif +#ifdef HAS_SCALEARGBFILTERCOLS_NEON +CANY(ScaleARGBFilterCols_Any_NEON, ScaleARGBFilterCols_NEON, + ScaleARGBFilterCols_C, 4, 3) +#endif +#undef CANY + +// Fixed scale down. +#define SDANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, FACTOR, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * FACTOR) * BPP, src_stride, \ + dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEROWDOWN2_SSE2 +SDANY(ScaleRowDown2_Any_SSE2, ScaleRowDown2_SSE2, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_SSE2, ScaleRowDown2Linear_SSE2, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_SSE2, ScaleRowDown2Box_SSE2, ScaleRowDown2Box_C, + 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN2_AVX2 +SDANY(ScaleRowDown2_Any_AVX2, ScaleRowDown2_AVX2, ScaleRowDown2_C, 2, 1, 31) +SDANY(ScaleRowDown2Linear_Any_AVX2, ScaleRowDown2Linear_AVX2, + ScaleRowDown2Linear_C, 2, 1, 31) +SDANY(ScaleRowDown2Box_Any_AVX2, ScaleRowDown2Box_AVX2, ScaleRowDown2Box_C, + 2, 1, 31) +#endif +#ifdef HAS_SCALEROWDOWN2_NEON +SDANY(ScaleRowDown2_Any_NEON, ScaleRowDown2_NEON, ScaleRowDown2_C, 2, 1, 15) +SDANY(ScaleRowDown2Linear_Any_NEON, ScaleRowDown2Linear_NEON, + ScaleRowDown2Linear_C, 2, 1, 15) +SDANY(ScaleRowDown2Box_Any_NEON, ScaleRowDown2Box_NEON, + ScaleRowDown2Box_C, 2, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_SSE2 +SDANY(ScaleRowDown4_Any_SSE2, ScaleRowDown4_SSE2, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_SSE2, ScaleRowDown4Box_SSE2, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN4_AVX2 +SDANY(ScaleRowDown4_Any_AVX2, ScaleRowDown4_AVX2, ScaleRowDown4_C, 4, 1, 15) +SDANY(ScaleRowDown4Box_Any_AVX2, ScaleRowDown4Box_AVX2, ScaleRowDown4Box_C, + 4, 1, 15) +#endif +#ifdef HAS_SCALEROWDOWN4_NEON +SDANY(ScaleRowDown4_Any_NEON, ScaleRowDown4_NEON, ScaleRowDown4_C, 4, 1, 7) +SDANY(ScaleRowDown4Box_Any_NEON, ScaleRowDown4Box_NEON, ScaleRowDown4Box_C, + 4, 1, 7) +#endif +#ifdef HAS_SCALEROWDOWN34_SSSE3 +SDANY(ScaleRowDown34_Any_SSSE3, ScaleRowDown34_SSSE3, + ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_SSSE3, ScaleRowDown34_0_Box_SSSE3, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_SSSE3, ScaleRowDown34_1_Box_SSSE3, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN34_NEON +SDANY(ScaleRowDown34_Any_NEON, ScaleRowDown34_NEON, + 
ScaleRowDown34_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_0_Box_Any_NEON, ScaleRowDown34_0_Box_NEON, + ScaleRowDown34_0_Box_C, 4 / 3, 1, 23) +SDANY(ScaleRowDown34_1_Box_Any_NEON, ScaleRowDown34_1_Box_NEON, + ScaleRowDown34_1_Box_C, 4 / 3, 1, 23) +#endif +#ifdef HAS_SCALEROWDOWN38_SSSE3 +SDANY(ScaleRowDown38_Any_SSSE3, ScaleRowDown38_SSSE3, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_SSSE3, ScaleRowDown38_3_Box_SSSE3, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 5) +SDANY(ScaleRowDown38_2_Box_Any_SSSE3, ScaleRowDown38_2_Box_SSSE3, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 5) +#endif +#ifdef HAS_SCALEROWDOWN38_NEON +SDANY(ScaleRowDown38_Any_NEON, ScaleRowDown38_NEON, + ScaleRowDown38_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_3_Box_Any_NEON, ScaleRowDown38_3_Box_NEON, + ScaleRowDown38_3_Box_C, 8 / 3, 1, 11) +SDANY(ScaleRowDown38_2_Box_Any_NEON, ScaleRowDown38_2_Box_NEON, + ScaleRowDown38_2_Box_C, 8 / 3, 1, 11) +#endif + +#ifdef HAS_SCALEARGBROWDOWN2_SSE2 +SDANY(ScaleARGBRowDown2_Any_SSE2, ScaleARGBRowDown2_SSE2, + ScaleARGBRowDown2_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Linear_Any_SSE2, ScaleARGBRowDown2Linear_SSE2, + ScaleARGBRowDown2Linear_C, 2, 4, 3) +SDANY(ScaleARGBRowDown2Box_Any_SSE2, ScaleARGBRowDown2Box_SSE2, + ScaleARGBRowDown2Box_C, 2, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWN2_NEON +SDANY(ScaleARGBRowDown2_Any_NEON, ScaleARGBRowDown2_NEON, + ScaleARGBRowDown2_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Linear_Any_NEON, ScaleARGBRowDown2Linear_NEON, + ScaleARGBRowDown2Linear_C, 2, 4, 7) +SDANY(ScaleARGBRowDown2Box_Any_NEON, ScaleARGBRowDown2Box_NEON, + ScaleARGBRowDown2Box_C, 2, 4, 7) +#endif +#undef SDANY + +// Scale down by even scale factor. +#define SDAANY(NAMEANY, SCALEROWDOWN_SIMD, SCALEROWDOWN_C, BPP, MASK) \ + void NAMEANY(const uint8* src_ptr, ptrdiff_t src_stride, int src_stepx, \ + uint8* dst_ptr, int dst_width) { \ + int r = (int)((unsigned int)dst_width % (MASK + 1)); \ + int n = dst_width - r; \ + if (n > 0) { \ + SCALEROWDOWN_SIMD(src_ptr, src_stride, src_stepx, dst_ptr, n); \ + } \ + SCALEROWDOWN_C(src_ptr + (n * src_stepx) * BPP, src_stride, \ + src_stepx, dst_ptr + n * BPP, r); \ + } + +#ifdef HAS_SCALEARGBROWDOWNEVEN_SSE2 +SDAANY(ScaleARGBRowDownEven_Any_SSE2, ScaleARGBRowDownEven_SSE2, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_SSE2, ScaleARGBRowDownEvenBox_SSE2, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif +#ifdef HAS_SCALEARGBROWDOWNEVEN_NEON +SDAANY(ScaleARGBRowDownEven_Any_NEON, ScaleARGBRowDownEven_NEON, + ScaleARGBRowDownEven_C, 4, 3) +SDAANY(ScaleARGBRowDownEvenBox_Any_NEON, ScaleARGBRowDownEvenBox_NEON, + ScaleARGBRowDownEvenBox_C, 4, 3) +#endif + +// Add rows box filter scale down. 
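All of the *ANY adapters above and below share one shape: run the SIMD kernel over the widest prefix that is a multiple of its natural vector width, then hand the 0..MASK leftover pixels to the C kernel. Hand-expanded from the SAANY macro defined just below, SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) comes out as roughly:

    void ScaleAddRow_Any_SSE2(const uint8* src_ptr, uint16* dst_ptr,
                              int src_width) {
      int n = src_width & ~15;                  // widest multiple of 16
      if (n > 0) {
        ScaleAddRow_SSE2(src_ptr, dst_ptr, n);  // SIMD bulk of the row
      }
      ScaleAddRow_C(src_ptr + n, dst_ptr + n,   // C tail, 0..15 pixels
                    src_width & 15);
    }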
+#define SAANY(NAMEANY, SCALEADDROW_SIMD, SCALEADDROW_C, MASK) \ + void NAMEANY(const uint8* src_ptr, uint16* dst_ptr, int src_width) { \ + int n = src_width & ~MASK; \ + if (n > 0) { \ + SCALEADDROW_SIMD(src_ptr, dst_ptr, n); \ + } \ + SCALEADDROW_C(src_ptr + n, dst_ptr + n, src_width & MASK); \ + } + +#ifdef HAS_SCALEADDROW_SSE2 +SAANY(ScaleAddRow_Any_SSE2, ScaleAddRow_SSE2, ScaleAddRow_C, 15) +#endif +#ifdef HAS_SCALEADDROW_AVX2 +SAANY(ScaleAddRow_Any_AVX2, ScaleAddRow_AVX2, ScaleAddRow_C, 31) +#endif +#ifdef HAS_SCALEADDROW_NEON +SAANY(ScaleAddRow_Any_NEON, ScaleAddRow_NEON, ScaleAddRow_C, 15) +#endif +#undef SAANY + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif + + + + + diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc index 05b58e1..40a2d1a 100644 --- a/third_party/libyuv/source/scale_argb.cc +++ b/third_party/libyuv/source/scale_argb.cc @@ -53,16 +53,27 @@ static void ScaleARGBDown2(int src_width, int src_height, } #if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : - (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : - ScaleARGBRowDown2Box_SSE2); + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_SSE2 : + ScaleARGBRowDown2Box_Any_SSE2); + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 : + ScaleARGBRowDown2Box_SSE2); + } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON : - ScaleARGBRowDown2_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_Any_NEON : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_Any_NEON : + ScaleARGBRowDown2Box_Any_NEON); + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_NEON : + (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_NEON : + ScaleARGBRowDown2Box_NEON); + } } #endif @@ -86,7 +97,7 @@ static void ScaleARGBDown4Box(int src_width, int src_height, int x, int dx, int y, int dy) { int j; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 2 * 4 + 15) & ~15; + const int kRowSize = (dst_width * 2 * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); int row_stride = src_stride * (dy >> 16); void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride, @@ -96,15 +107,22 @@ static void ScaleARGBDown4Box(int src_width, int src_height, assert(dx == 65536 * 4); // Test scale factor of 4. assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4. 
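  // Flow sketch: each iteration of the loop below runs the 2:1 box kernel
  // selected here over two pairs of source rows into the double-width 'row'
  // buffer, then box-reduces that buffer once more into a single destination
  // row, so the 4:1 box needs no kernel of its own; the asserts above pin
  // the exact scale factor this shortcut assumes.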
#if defined(HAS_SCALEARGBROWDOWN2_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { - ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON; + } } #endif + for (j = 0; j < dst_height; ++j) { ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2); ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride, @@ -135,15 +153,23 @@ static void ScaleARGBDownEven(int src_width, int src_height, assert(IS_ALIGNED(src_height, 2)); src_argb += (y >> 16) * src_stride + (x >> 16) * 4; #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : - ScaleARGBRowDownEven_SSE2; + if (TestCpuFlag(kCpuHasSSE2)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_SSE2 : + ScaleARGBRowDownEven_Any_SSE2; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 : + ScaleARGBRowDownEven_SSE2; + } } #endif #if defined(HAS_SCALEARGBROWDOWNEVEN_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4)) { - ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : - ScaleARGBRowDownEven_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_Any_NEON : + ScaleARGBRowDownEven_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : + ScaleARGBRowDownEven_NEON; + } } #endif @@ -230,6 +256,14 @@ static void ScaleARGBBilinearDown(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear. // Allocate a row of ARGB. { @@ -321,11 +355,27 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -344,7 +394,7 @@ static void ScaleARGBBilinearUp(int src_width, int src_height, const uint8* src = src_argb + yi * src_stride; // Allocate 2 rows of ARGB. 
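  // The round-up below widens from (x + 15) & ~15 to (x + 31) & ~31, i.e.
  // from a 16- to a 32-byte multiple, which appears intended to let the new
  // 32-byte AVX2 row kernels store a full vector on their final iteration
  // without spilling into the second buffer. For example, dst_width * 4 =
  // 100 now rounds to kRowSize = 128 where it previously gave 112.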
- const int kRowSize = (dst_width * 4 + 15) & ~15; + const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); uint8* rowptr = row; @@ -495,11 +545,27 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3; } #endif +#if defined(HAS_SCALEARGBFILTERCOLS_NEON) + if (filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_Any_NEON; + if (IS_ALIGNED(dst_width, 4)) { + ScaleARGBFilterCols = ScaleARGBFilterCols_NEON; + } + } +#endif #if defined(HAS_SCALEARGBCOLS_SSE2) if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) { ScaleARGBFilterCols = ScaleARGBCols_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (!filtering && TestCpuFlag(kCpuHasNEON)) { + ScaleARGBFilterCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBFilterCols = ScaleARGBCols_NEON; + } + } +#endif if (!filtering && src_width * 2 == dst_width && x < 0x8000) { ScaleARGBFilterCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -521,7 +587,7 @@ static void ScaleYUVToARGBBilinearUp(int src_width, int src_height, const uint8* src_row_v = src_v + uv_yi * src_stride_v; // Allocate 2 rows of ARGB. - const int kRowSize = (dst_width * 4 + 15) & ~15; + const int kRowSize = (dst_width * 4 + 31) & ~31; align_buffer_64(row, kRowSize * 2); // Allocate 1 row of ARGB for source conversion. @@ -607,6 +673,14 @@ static void ScaleARGBSimple(int src_width, int src_height, ScaleARGBCols = ScaleARGBCols_SSE2; } #endif +#if defined(HAS_SCALEARGBCOLS_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ScaleARGBCols = ScaleARGBCols_Any_NEON; + if (IS_ALIGNED(dst_width, 8)) { + ScaleARGBCols = ScaleARGBCols_NEON; + } + } +#endif if (src_width * 2 == dst_width && x < 0x8000) { ScaleARGBCols = ScaleARGBColsUp2_C; #if defined(HAS_SCALEARGBCOLSUP2_SSE2) @@ -744,6 +818,7 @@ int ARGBScaleClip(const uint8* src_argb, int src_stride_argb, if (!src_argb || src_width == 0 || src_height == 0 || !dst_argb || dst_width <= 0 || dst_height <= 0 || clip_x < 0 || clip_y < 0 || + clip_width > 32768 || clip_height > 32768 || (clip_x + clip_width) > dst_width || (clip_y + clip_height) > dst_height) { return -1; @@ -762,6 +837,7 @@ int ARGBScale(const uint8* src_argb, int src_stride_argb, int dst_width, int dst_height, enum FilterMode filtering) { if (!src_argb || src_width == 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_argb || dst_width <= 0 || dst_height <= 0) { return -1; } diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc index 96e2564..1711f3d 100644 --- a/third_party/libyuv/source/scale_common.cc +++ b/third_party/libyuv/source/scale_common.cc @@ -621,39 +621,31 @@ void ScaleRowDown38_2_Box_16_C(const uint16* src_ptr, ptrdiff_t src_stride, } } -void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_C(const uint8* src_ptr, uint16* dst_ptr, int src_width) { int x; assert(src_width > 0); - assert(src_height > 0); - for (x = 0; x < src_width; ++x) { - const uint8* s = src_ptr + x; - unsigned int sum = 0u; - int y; - for (y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - // TODO(fbarchard): Consider limitting height to 256 to avoid overflow. - dst_ptr[x] = sum < 65535u ? 
sum : 65535u; + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; } } -void ScaleAddRows_16_C(const uint16* src_ptr, ptrdiff_t src_stride, - uint32* dst_ptr, int src_width, int src_height) { +void ScaleAddRow_16_C(const uint16* src_ptr, uint32* dst_ptr, int src_width) { int x; assert(src_width > 0); - assert(src_height > 0); - for (x = 0; x < src_width; ++x) { - const uint16* s = src_ptr + x; - unsigned int sum = 0u; - int y; - for (y = 0; y < src_height; ++y) { - sum += s[0]; - s += src_stride; - } - // No risk of overflow here now - dst_ptr[x] = sum; + for (x = 0; x < src_width - 1; x += 2) { + dst_ptr[0] += src_ptr[0]; + dst_ptr[1] += src_ptr[1]; + src_ptr += 2; + dst_ptr += 2; + } + if (src_width & 1) { + dst_ptr[0] += src_ptr[0]; } } @@ -1030,10 +1022,6 @@ enum FilterMode ScaleFilterReduce(int src_width, int src_height, if (dst_width * 2 >= src_width && dst_height * 2 >= src_height) { filtering = kFilterBilinear; } - // If scaling to larger, switch from Box to Bilinear. - if (dst_width >= src_width || dst_height >= src_height) { - filtering = kFilterBilinear; - } } if (filtering == kFilterBilinear) { if (src_height == 1) { diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_gcc.cc similarity index 97% rename from third_party/libyuv/source/scale_posix.cc rename to third_party/libyuv/source/scale_gcc.cc index bb6e57e..8a6ac54 100644 --- a/third_party/libyuv/source/scale_posix.cc +++ b/third_party/libyuv/source/scale_gcc.cc @@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ); } +// Reads 16xN bytes and produces 16 shorts at a time. void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { int tmp_height = 0; intptr_t tmp_src = 0; asm volatile ( + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" "pxor %%xmm4,%%xmm4 \n" - "sub $0x1,%5 \n" LABELALIGN "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "mov %0,%3 \n" - "add %6,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "mov %5,%2 \n" - "test %2,%2 \n" - "je 3f \n" - - LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "add %6,%0 \n" + "movdqu " MEMACCESS(3) ",%%xmm2 \n" + "add %6,%3 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" "punpckhbw %%xmm4,%%xmm3 \n" "paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm3,%%xmm1 \n" "sub $0x1,%2 \n" - "jg 2b \n" + "jg 1b \n" - LABELALIGN - "3: \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x10,3) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_ptr), // %0 @@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. 
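The rewritten C accumulators in scale_common.cc above add a single source row into the destination per call, with the vertical loop and the zeroing memset moved to the caller; the old per-pixel 65535 clamp is gone from the C path, which stays correct only while boxheight * 255 fits in 16 bits (up to 257 rows of 8-bit input). A rolled-up model of the same behavior; the *Model name is illustrative only, with stdint names standing in for the libyuv typedefs:

    #include <stdint.h>

    /* One call per source row: dst[i] += src[i] across the row.
       The caller zeroes dst before the first row of each box. */
    static void ScaleAddRowModel(const uint8_t* src, uint16_t* dst,
                                 int width) {
      int i;
      for (i = 0; i < width; ++i) {
        dst[i] = (uint16_t)(dst[i] + src[i]);
      }
    }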
void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { + int src_stepx, uint8* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12 = 0; asm volatile ( diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc index 7921219..7825878 100644 --- a/third_party/libyuv/source/scale_neon.cc +++ b/third_party/libyuv/source/scale_neon.cc @@ -43,6 +43,30 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld1.8 {q0, q1}, [%0]! \n" // load pixels and post inc + "subs %2, %2, #16 \n" // 16 processed per loop + "vpaddl.u8 q0, q0 \n" // add adjacent + "vpaddl.u8 q1, q1 \n" + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + MEMACCESS(1) + "vst1.8 {q0}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "q0", "q1" // Clobber List + ); +} + // Read 32x2 average down and write 16x1. void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { @@ -517,6 +541,112 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ); } +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + ".p2align 2 \n" + "1: \n" + "mov %0, %1 \n" + "mov r12, %5 \n" + "veor q2, q2, q2 \n" + "veor q3, q3, q3 \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "vld1.8 {q0}, [%0], %3 \n" + "vaddw.u8 q3, q3, d1 \n" + "vaddw.u8 q2, q2, d0 \n" + "subs r12, r12, #1 \n" + "bgt 2b \n" + MEMACCESS(2) + "vst1.16 {q2, q3}, [%2]! 
\n" // store pixels + "add %1, %1, #16 \n" + "subs %4, %4, #16 \n" // 16 processed per loop + "bgt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "r12", "q0", "q1", "q2", "q3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.8 {d6["#n"], d7["#n"]}, [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q3, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q1, q1, q0 \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "vadd.s32 q2, q1, q3 \n" + "vshl.i32 q0, q3, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "vmov q10, q1 \n" + "vmov q11, q2 \n" + "vuzp.16 q10, q11 \n" + "vmovl.u8 q8, d6 \n" + "vmovl.u8 q9, d7 \n" + "vsubl.s16 q11, d18, d16 \n" + "vsubl.s16 q12, d19, d17 \n" + "vmovl.u16 q13, d20 \n" + "vmovl.u16 q10, d21 \n" + "vmul.s32 q11, q11, q13 \n" + "vmul.s32 q12, q12, q10 \n" + "vshrn.s32 d18, q11, #16 \n" + "vshrn.s32 d19, q12, #16 \n" + "vadd.s16 q8, q8, q9 \n" + "vmovn.s16 d6, q8 \n" + + MEMACCESS(0) + "vst1.8 {d6}, [%0]! \n" // store pixels + "vadd.s32 q1, q1, q0 \n" + "vadd.s32 q2, q2, q0 \n" + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13" + ); +} + +#undef LOAD2_DATA8_LANE + // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, @@ -640,6 +770,35 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + MEMACCESS(0) + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "subs %2, %2, #8 \n" // 8 processed per loop + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts. + "vrshrn.u16 d0, q0, #1 \n" // downshift, round and pack + "vrshrn.u16 d1, q1, #1 \n" + "vrshrn.u16 d2, q2, #1 \n" + "vrshrn.u16 d3, q3, #1 \n" + MEMACCESS(1) + "vst4.8 {d0, d1, d2, d3}, [%1]! 
\n" + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} + void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( @@ -757,6 +916,119 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, ); } +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD1_DATA32_LANE(dn, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld1.32 {"#dn"["#n"]}, [%6] \n" + +void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int tmp = 0; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "1: \n" + LOAD1_DATA32_LANE(d0, 0) + LOAD1_DATA32_LANE(d0, 1) + LOAD1_DATA32_LANE(d1, 0) + LOAD1_DATA32_LANE(d1, 1) + LOAD1_DATA32_LANE(d2, 0) + LOAD1_DATA32_LANE(d2, 1) + LOAD1_DATA32_LANE(d3, 0) + LOAD1_DATA32_LANE(d3, 1) + + MEMACCESS(0) + "vst1.32 {q0, q1}, [%0]! \n" // store pixels + "subs %2, %2, #8 \n" // 8 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(dn1, dn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "vld2.32 {"#dn1"["#n"], "#dn2"["#n"]}, [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + asm volatile ( + ".p2align 2 \n" + "vdup.32 q0, %3 \n" // x + "vdup.32 q1, %4 \n" // dx + "vld1.32 {q2}, [%5] \n" // 0 1 2 3 + "vshl.i32 q9, q1, #2 \n" // 4 * dx + "vmul.s32 q1, q1, q2 \n" + "vmov.i8 q3, #0x7f \n" // 0x7F + "vmov.i16 q15, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "vadd.s32 q8, q1, q0 \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(d0, d2, 0) + LOAD2_DATA32_LANE(d0, d2, 1) + LOAD2_DATA32_LANE(d1, d3, 0) + LOAD2_DATA32_LANE(d1, d3, 1) + "vshrn.i32 d22, q8, #9 \n" + "vand.16 d22, d22, d30 \n" + "vdup.8 d24, d22[0] \n" + "vdup.8 d25, d22[2] \n" + "vdup.8 d26, d22[4] \n" + "vdup.8 d27, d22[6] \n" + "vext.8 d4, d24, d25, #4 \n" + "vext.8 d5, d26, d27, #4 \n" // f + "veor.8 q10, q2, q3 \n" // 0x7f ^ f + "vmull.u8 q11, d0, d20 \n" + "vmull.u8 q12, d1, d21 \n" + "vmull.u8 q13, d2, d4 \n" + "vmull.u8 q14, d3, d5 \n" + "vadd.i16 q11, q11, q13 \n" + "vadd.i16 q12, q12, q14 \n" + "vshrn.i16 d0, q11, #7 \n" + "vshrn.i16 d1, q12, #7 \n" + + MEMACCESS(0) + "vst1.32 {d0, d1}, [%0]! 
\n" // store pixels + "vadd.s32 q8, q8, q9 \n" + "subs %2, %2, #4 \n" // 4 processed per loop + "bgt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 + "+r"(x), // %3 + "+r"(dx), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", + "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +#undef LOAD2_DATA32_LANE + #endif // defined(__ARM_NEON__) && !defined(__aarch64__) #ifdef __cplusplus diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc index fb68b67..1d55193 100644 --- a/third_party/libyuv/source/scale_neon64.cc +++ b/third_party/libyuv/source/scale_neon64.cc @@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "1: \n" // load even pixels into v0, odd into v1 MEMACCESS(0) - "ld2 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %2, %2, #16 \n" // 16 processed per loop + "ld2 {v0.16b,v1.16b}, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 processed per loop MEMACCESS(1) "st1 {v1.16b}, [%1], #16 \n" // store odd pixels "b.gt 1b \n" @@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +// Read 32x1 average down and write 16x1. +void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS(0) + "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc + "subs %w2, %w2, #16 \n" // 16 processed per loop + "uaddlp v0.8h, v0.16b \n" // add adjacent + "uaddlp v1.8h, v1.16b \n" + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn2 v0.16b, v1.8h, #1 \n" + MEMACCESS(1) + "st1 {v0.16b}, [%1], #16 \n" + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst), // %1 + "+r"(dst_width) // %2 + : + : "v0", "v1" // Clobber List + ); +} + // Read 32x2 average down and write 16x1. 
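The ARMv7 and AArch64 ScaleRowDown2Linear kernels above use the same two-step idea: a pairwise widening add (vpaddl.u8 / uaddlp) sums each horizontal byte pair into a 16-bit lane, and a rounding narrowing shift by one (vrshrn / rshrn #1) turns that sum into a rounded average. A scalar model (illustrative name, stdint types standing in for the libyuv typedefs); the Box kernels that follow extend the same idea across two rows:

    #include <stdint.h>

    /* Per-lane meaning of pairwise-add followed by rounding-narrow #1. */
    static void ScaleRowDown2LinearModel(const uint8_t* src, uint8_t* dst,
                                         int dst_width) {
      int x;
      for (x = 0; x < dst_width; ++x) {
        dst[x] = (uint8_t)((src[2 * x] + src[2 * x + 1] + 1) >> 1);
      }
    }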
void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { @@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc MEMACCESS(1) "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc - "subs %3, %3, #16 \n" // 16 processed per loop + "subs %w3, %w3, #16 \n" // 16 processed per loop "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent "uaddlp v1.8h, v1.16b \n" "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1 @@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %2, %2, #8 \n" // 8 processed per loop + "subs %w2, %w2, #8 \n" // 8 processed per loop MEMACCESS(1) "st1 {v2.8b}, [%1], #8 \n" "b.gt 1b \n" @@ -103,7 +126,7 @@ asm volatile ( "ld1 {v2.16b}, [%3], #16 \n" MEMACCESS(5) "ld1 {v3.16b}, [%4], #16 \n" - "subs %5, %5, #4 \n" + "subs %w5, %w5, #4 \n" "uaddlp v0.8h, v0.16b \n" "uadalp v0.8h, v1.16b \n" "uadalp v0.8h, v2.16b \n" @@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, "1: \n" MEMACCESS(0) "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 - "subs %2, %2, #24 \n" + "subs %w2, %w2, #24 \n" "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2 MEMACCESS(1) "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n" @@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %2, %2, #24 \n" + "subs %w2, %w2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room @@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0 MEMACCESS(3) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1 - "subs %2, %2, #24 \n" + "subs %w2, %w2, #24 \n" // average src line 0 with src line 1 "urhadd v0.8b, v0.8b, v4.8b \n" "urhadd v1.8b, v1.8b, v5.8b \n" @@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, "1: \n" MEMACCESS(0) "ld1 {v0.16b,v1.16b}, [%0], #32 \n" - "subs %2, %2, #12 \n" + "subs %w2, %w2, #12 \n" "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n" MEMACCESS(1) "st1 {v2.8b}, [%1], #8 \n" @@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" MEMACCESS(4) "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n" - "subs %4, %4, #12 \n" + "subs %w4, %w4, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 @@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" MEMACCESS(3) "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n" - "subs %3, %3, #12 \n" + "subs %w3, %w3, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 @@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ); } +void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride, + uint16* dst_ptr, int src_width, int src_height) { + const uint8* src_tmp = NULL; + asm volatile ( + "1: \n" + "mov %0, %1 \n" + "mov w12, %w5 \n" + "eor v2.16b, v2.16b, v2.16b \n" + "eor v3.16b, v3.16b, v3.16b \n" + "2: \n" + // load 16 pixels into q0 + MEMACCESS(0) + "ld1 {v0.16b}, [%0], %3 \n" + "uaddw2 v3.8h, v3.8h, v0.16b \n" + "uaddw v2.8h, v2.8h, v0.8b \n" + "subs w12, w12, #1 \n" + "b.gt 2b \n" + MEMACCESS(2) + "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels + "add %1, %1, #16 \n" + "subs %w4, %w4, #16 \n" // 16 processed per loop + "b.gt 1b \n" + : "+r"(src_tmp), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_ptr), // %2 + "+r"(src_stride), // %3 + "+r"(src_width), // %4 + "+r"(src_height) // %5 + : + : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List + ); +} + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA8_LANE(n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {v4.b, v5.b}["#n"], [%6] \n" + +void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_ptr; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v3.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v1.4s, v1.4s, v0.4s \n" + // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx + "add v2.4s, v1.4s, v3.4s \n" + "shl v0.4s, v3.4s, #1 \n" // 8 * dx + "1: \n" + LOAD2_DATA8_LANE(0) + LOAD2_DATA8_LANE(1) + LOAD2_DATA8_LANE(2) + LOAD2_DATA8_LANE(3) + LOAD2_DATA8_LANE(4) + LOAD2_DATA8_LANE(5) + LOAD2_DATA8_LANE(6) + LOAD2_DATA8_LANE(7) + "mov v6.16b, v1.16b \n" + "mov v7.16b, v2.16b \n" + "uzp1 v6.8h, v6.8h, v7.8h \n" + "ushll v4.8h, v4.8b, #0 \n" + "ushll v5.8h, v5.8b, #0 \n" + "ssubl v16.4s, v5.4h, v4.4h \n" + "ssubl2 v17.4s, v5.8h, v4.8h \n" + "ushll v7.4s, v6.4h, #0 \n" + "ushll2 v6.4s, v6.8h, #0 \n" + "mul v16.4s, v16.4s, v7.4s \n" + "mul v17.4s, v17.4s, v6.4s \n" + "shrn v6.4h, v16.4s, #16 \n" + "shrn2 v6.8h, v17.4s, #16 \n" + "add v4.8h, v4.8h, v6.8h \n" + "xtn v4.8b, v4.8h \n" + + MEMACCESS(0) + "st1 {v4.8b}, [%0], #8 \n" // store pixels + "add v1.4s, v1.4s, v0.4s \n" + "add v2.4s, v2.4s, v0.4s \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", + "v4", "v5", "v6", "v7", "v16", "v17" + ); +} + +#undef LOAD2_DATA8_LANE + // 16x2 -> 16x1 void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { int y_fraction = 256 - source_y_fraction; asm volatile ( - "cmp %4, #0 \n" + "cmp %w4, #0 \n" "b.eq 100f \n" "add %2, %2, %1 \n" - "cmp %4, #64 \n" + "cmp %w4, #64 \n" "b.eq 75f \n" - "cmp %4, #128 \n" + "cmp %w4, #128 \n" "b.eq 50f \n" - "cmp %4, #192 \n" + "cmp %w4, #192 \n" "b.eq 25f \n" "dup v5.8b, %w4 \n" @@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" 
MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "umull v6.8h, v0.8b, v4.8b \n" "umull2 v7.8h, v0.16b, v4.16b \n" "umlal v6.8h, v1.8b, v5.8b \n" @@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) @@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "ld1 {v0.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v1.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" @@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "ld1 {v1.16b}, [%1], #16 \n" MEMACCESS(2) "ld1 {v0.16b}, [%2], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" "urhadd v0.16b, v0.16b, v1.16b \n" "urhadd v0.16b, v0.16b, v1.16b \n" MEMACCESS(0) @@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "100: \n" MEMACCESS(1) "ld1 {v0.16b}, [%1], #16 \n" - "subs %3, %3, #16 \n" + "subs %w3, %w3, #16 \n" MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -631,7 +761,7 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "ld2 {v0.4s, v1.4s}, [%0], #32 \n" MEMACCESS (0) "ld2 {v2.4s, v3.4s}, [%0], #32 \n" - "subs %2, %2, #8 \n" // 8 processed per loop + "subs %w2, %w2, #8 \n" // 8 processed per loop MEMACCESS (1) "st1 {v1.16b}, [%1], #16 \n" // store odd pixels MEMACCESS (1) @@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride, + uint8* dst_argb, int dst_width) { + asm volatile ( + "1: \n" + MEMACCESS (0) + // load 8 ARGB pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts. + "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack + "rshrn v1.8b, v1.8h, #1 \n" + "rshrn v2.8b, v2.8h, #1 \n" + "rshrn v3.8b, v3.8h, #1 \n" + MEMACCESS (1) + "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List + ); +} + void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( @@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "1: \n" MEMACCESS (0) "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels. - "subs %3, %3, #8 \n" // 8 processed per loop. + "subs %w3, %w3, #8 \n" // 8 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. @@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride, "ld1 {v0.s}[2], [%0], %3 \n" MEMACCESS(0) "ld1 {v0.s}[3], [%0], %3 \n" - "subs %2, %2, #4 \n" // 4 pixels per loop. + "subs %w2, %w2, #4 \n" // 4 pixels per loop. 
    MEMACCESS(1)
    "st1        {v0.16b}, [%1], #16            \n"
    "b.gt       1b                             \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(dst_width)    // %2
- : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
+ : "r"((int64)(src_stepx * 4)) // %3
  : "memory", "cc", "v0"
  );
}

// Reads 4 pixels at a time.
// Alignment requirement: src_argb 4 byte aligned.
-// TODO, might be worth another optimization pass in future.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
// It could be upgraded to 8 pixels at a time to start with.
void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                  int src_stepx,
@@ -717,36 +874,36 @@
    "add        %1, %1, %0                     \n"
  "1:                                          \n"
    MEMACCESS(0)
-    "ld1       {v0.8b}, [%0], %4              \n"  // Read 4 2x2 blocks -> 2x1
+    "ld1        {v0.8b}, [%0], %4             \n"  // Read 4 2x2 blocks -> 2x1
    MEMACCESS(1)
-    "ld1       {v1.8b}, [%1], %4              \n"
+    "ld1        {v1.8b}, [%1], %4             \n"
    MEMACCESS(0)
-    "ld1       {v2.8b}, [%0], %4              \n"
+    "ld1        {v2.8b}, [%0], %4             \n"
    MEMACCESS(1)
-    "ld1       {v3.8b}, [%1], %4              \n"
+    "ld1        {v3.8b}, [%1], %4             \n"
    MEMACCESS(0)
-    "ld1       {v4.8b}, [%0], %4              \n"
+    "ld1        {v4.8b}, [%0], %4             \n"
    MEMACCESS(1)
-    "ld1       {v5.8b}, [%1], %4              \n"
+    "ld1        {v5.8b}, [%1], %4             \n"
    MEMACCESS(0)
-    "ld1       {v6.8b}, [%0], %4              \n"
+    "ld1        {v6.8b}, [%0], %4             \n"
    MEMACCESS(1)
-    "ld1       {v7.8b}, [%1], %4              \n"
-    "uaddl     v0.8h, v0.8b, v1.8b            \n"
-    "uaddl     v2.8h, v2.8b, v3.8b            \n"
-    "uaddl     v4.8h, v4.8b, v5.8b            \n"
-    "uaddl     v6.8h, v6.8b, v7.8b            \n"
-    "mov       v16.d[1], v0.d[1]              \n"  // ab_cd -> ac_bd
-    "mov       v0.d[1], v2.d[0]               \n"
-    "mov       v2.d[0], v16.d[1]              \n"
-    "mov       v16.d[1], v4.d[1]              \n"  // ef_gh -> eg_fh
-    "mov       v4.d[1], v6.d[0]               \n"
-    "mov       v6.d[0], v16.d[1]              \n"
-    "add       v0.8h, v0.8h, v2.8h            \n"  // (a+b)_(c+d)
-    "add       v4.8h, v4.8h, v6.8h            \n"  // (e+f)_(g+h)
-    "rshrn     v0.8b, v0.8h, #2               \n"  // first 2 pixels.
-    "rshrn2    v0.16b, v4.8h, #2              \n"  // next 2 pixels.
-    "subs      %3, %3, #4                     \n"  // 4 pixels per loop.
+    "ld1        {v7.8b}, [%1], %4             \n"
+    "uaddl      v0.8h, v0.8b, v1.8b           \n"
+    "uaddl      v2.8h, v2.8b, v3.8b           \n"
+    "uaddl      v4.8h, v4.8b, v5.8b           \n"
+    "uaddl      v6.8h, v6.8b, v7.8b           \n"
+    "mov        v16.d[1], v0.d[1]             \n"  // ab_cd -> ac_bd
+    "mov        v0.d[1], v2.d[0]              \n"
+    "mov        v2.d[0], v16.d[1]             \n"
+    "mov        v16.d[1], v4.d[1]             \n"  // ef_gh -> eg_fh
+    "mov        v4.d[1], v6.d[0]              \n"
+    "mov        v6.d[0], v16.d[1]             \n"
+    "add        v0.8h, v0.8h, v2.8h           \n"  // (a+b)_(c+d)
+    "add        v4.8h, v4.8h, v6.8h           \n"  // (e+f)_(g+h)
+    "rshrn      v0.8b, v0.8h, #2              \n"  // first 2 pixels.
+    "rshrn2     v0.16b, v4.8h, #2             \n"  // next 2 pixels.
+    "subs       %w3, %w3, #4                  \n"  // 4 pixels per loop.
    MEMACCESS(2)
    "st1     {v0.16b}, [%2], #16               \n"
    "b.gt 1b                                   \n"
@@ -754,10 +911,129 @@
    "+r"(src_stride),  // %1
    "+r"(dst_argb),    // %2
    "+r"(dst_width)    // %3
-  : "r"(src_stepx * 4) // %4
+  : "r"((int64)(src_stepx * 4)) // %4
  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
  );
}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n)                               \
+    "lsr        %5, %3, #16                    \n"             \
+    "add        %6, %1, %5, lsl #2             \n"             \
+    "add        %3, %3, %4                     \n"             \
+    MEMACCESS(6)                                               \
+    "ld1        {"#vn".s}["#n"], [%6]          \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width;  // Work around ios 64 bit warning.
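  // (This widening and the %wN operands used throughout the file are two
  // sides of the same AArch64 rule: an "r"-constrained asm operand names a
  // full 64-bit x register, so a 32-bit int must either be widened up front,
  // as dst_width64/x64/dx64 are, or touched only through the 32-bit register
  // view, as in "subs %w2, %w2, #16".)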
+ int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + int64 tmp64 = 0; + asm volatile ( + "1: \n" + LOAD1_DATA32_LANE(v0, 0) + LOAD1_DATA32_LANE(v0, 1) + LOAD1_DATA32_LANE(v0, 2) + LOAD1_DATA32_LANE(v0, 3) + LOAD1_DATA32_LANE(v1, 0) + LOAD1_DATA32_LANE(v1, 1) + LOAD1_DATA32_LANE(v1, 2) + LOAD1_DATA32_LANE(v1, 3) + + MEMACCESS(0) + "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp64), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1" + ); +} + +#undef LOAD1_DATA32_LANE + +// TODO(Yang Zhang): Investigate less load instructions for +// the x/dx stepping +#define LOAD2_DATA32_LANE(vn1, vn2, n) \ + "lsr %5, %3, #16 \n" \ + "add %6, %1, %5, lsl #2 \n" \ + "add %3, %3, %4 \n" \ + MEMACCESS(6) \ + "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n" + +void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb, + int dst_width, int x, int dx) { + int dx_offset[4] = {0, 1, 2, 3}; + int* tmp = dx_offset; + const uint8* src_tmp = src_argb; + int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning. + int64 x64 = (int64) x; + int64 dx64 = (int64) dx; + asm volatile ( + "dup v0.4s, %w3 \n" // x + "dup v1.4s, %w4 \n" // dx + "ld1 {v2.4s}, [%5] \n" // 0 1 2 3 + "shl v6.4s, v1.4s, #2 \n" // 4 * dx + "mul v1.4s, v1.4s, v2.4s \n" + "movi v3.16b, #0x7f \n" // 0x7F + "movi v4.8h, #0x7f \n" // 0x7F + // x , x + 1 * dx, x + 2 * dx, x + 3 * dx + "add v5.4s, v1.4s, v0.4s \n" + "1: \n" + // d0, d1: a + // d2, d3: b + LOAD2_DATA32_LANE(v0, v1, 0) + LOAD2_DATA32_LANE(v0, v1, 1) + LOAD2_DATA32_LANE(v0, v1, 2) + LOAD2_DATA32_LANE(v0, v1, 3) + "shrn v2.4h, v5.4s, #9 \n" + "and v2.8b, v2.8b, v4.8b \n" + "dup v16.8b, v2.b[0] \n" + "dup v17.8b, v2.b[2] \n" + "dup v18.8b, v2.b[4] \n" + "dup v19.8b, v2.b[6] \n" + "ext v2.8b, v16.8b, v17.8b, #4 \n" + "ext v17.8b, v18.8b, v19.8b, #4 \n" + "ins v2.d[1], v17.d[0] \n" // f + "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f + "umull v16.8h, v0.8b, v7.8b \n" + "umull2 v17.8h, v0.16b, v7.16b \n" + "umull v18.8h, v1.8b, v2.8b \n" + "umull2 v19.8h, v1.16b, v2.16b \n" + "add v16.8h, v16.8h, v18.8h \n" + "add v17.8h, v17.8h, v19.8h \n" + "shrn v0.8b, v16.8h, #7 \n" + "shrn2 v0.16b, v17.8h, #7 \n" + + MEMACCESS(0) + "st1 {v0.4s}, [%0], #16 \n" // store pixels + "add v5.4s, v5.4s, v6.4s \n" + "subs %w2, %w2, #4 \n" // 4 processed per loop + "b.gt 1b \n" + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width64), // %2 + "+r"(x64), // %3 + "+r"(dx64), // %4 + "+r"(tmp), // %5 + "+r"(src_tmp) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", + "v6", "v7", "v16", "v17", "v18", "v19" + ); +} + +#undef LOAD2_DATA32_LANE + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc index e0209cd..c3896eb 100644 --- a/third_party/libyuv/source/scale_win.cc +++ b/third_party/libyuv/source/scale_win.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/scale_row.h" #ifdef __cplusplus namespace libyuv { @@ -16,7 +17,8 @@ extern "C" { #endif // This module is for Visual C x86. 
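A note on the ScaleARGBFilterCols kernels added above: they blend two source pixels with a 7-bit fraction taken from the 16.16 fixed-point x accumulator; f is bits 9..15 of x (shrn #9, then masking with 0x7f), and 127 - f is formed as 0x7f ^ f. Per channel, a scalar model of what the widening multiplies and the final shrn #7 compute (illustrative name, not a libyuv function):

    #include <stdint.h>

    /* 7-bit blend of pixels a and b at fixed-point position x (16.16). */
    static uint8_t ArgbFilterColModel(uint8_t a, uint8_t b, uint32_t x) {
      uint32_t f = (x >> 9) & 0x7f;  /* fraction, 0..127 */
      return (uint8_t)((a * (0x7f ^ f) + b * f) >> 7);
    }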
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && \ + defined(_MSC_VER) && !defined(__clang__) // Offsets for source bytes 0 to 9 static uvec8 kShuf0 = @@ -93,8 +95,7 @@ static uvec16 kScaleAb2 = { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 }; // Reads 32 pixels, throws half away and writes 16 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -120,8 +121,7 @@ void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x1 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -157,8 +157,7 @@ void ScaleRowDown2Linear_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x2 rectangle to 16x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -199,9 +198,116 @@ void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } +#ifdef HAS_SCALEROWDOWN2_AVX2 +// Reads 64 pixels, throws half away and writes 32 pixels. +__declspec(naked) +void ScaleRowDown2_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpsrlw ymm0, ymm0, 8 // isolate odd pixels. + vpsrlw ymm1, ymm1, 8 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x1 rectangle to 32x1. +__declspec(naked) +void ScaleRowDown2Linear_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x2 rectangle to 32x1. 
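A note on the vpermq ymm0, ymm0, 0xd8 that follows each vpackuswb in these AVX2 kernels: vpackuswb packs within each 128-bit lane, so the ymm result comes out lane-interleaved, and the cross-lane qword permute restores linear order:

    /* vpermq imm 0xd8 = 0b11011000 selects source qwords (0, 2, 1, 3):  */
    /*   after vpackuswb: [A0 | B0 | A1 | B1]  (A = 1st input, B = 2nd)  */
    /*   after vpermq:    [A0 | A1 | B0 | B1]  (linear byte order)       */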
+__declspec(naked) +void ScaleRowDown2Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // src_ptr + mov esi, [esp + 4 + 8] // src_stride + mov edx, [esp + 4 + 12] // dst_ptr + mov ecx, [esp + 4 + 16] // dst_width + + vpcmpeqb ymm4, ymm4, ymm4 // '1' constant, 8b + vpsrlw ymm4, ymm4, 15 + vpackuswb ymm4, ymm4, ymm4 + vpxor ymm5, ymm5, ymm5 // constant 0 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + lea eax, [eax + 64] + + vpmaddubsw ymm0, ymm0, ymm4 // average horizontally + vpmaddubsw ymm1, ymm1, ymm4 + vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 + vpavgw ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], ymm0 + lea edx, [edx + 32] + sub ecx, 32 + jg wloop + + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN2_AVX2 + // Point samples 32 pixels to 8 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -232,8 +338,7 @@ void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } // Blends 32x4 rectangle to 8x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -248,11 +353,11 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, psrlw xmm7, 8 wloop: - movdqu xmm0, [eax] + movdqu xmm0, [eax] // average rows movdqu xmm1, [eax + 16] movdqu xmm2, [eax + esi] movdqu xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 // average rows + pavgb xmm0, xmm2 pavgb xmm1, xmm3 movdqu xmm2, [eax + esi * 2] movdqu xmm3, [eax + esi * 2 + 16] @@ -291,13 +396,102 @@ void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } +#ifdef HAS_SCALEROWDOWN4_AVX2 +// Point samples 64 pixels to 16 pixels. +__declspec(naked) +void ScaleRowDown4_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + mov eax, [esp + 4] // src_ptr + // src_stride ignored + mov edx, [esp + 12] // dst_ptr + mov ecx, [esp + 16] // dst_width + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0x00ff0000 + vpsrld ymm5, ymm5, 24 + vpslld ymm5, ymm5, 16 + + wloop: + vmovdqu ymm0, [eax] + vmovdqu ymm1, [eax + 32] + lea eax, [eax + 64] + vpand ymm0, ymm0, ymm5 + vpand ymm1, ymm1, ymm5 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpsrlw ymm0, ymm0, 8 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + vzeroupper + ret + } +} + +// Blends 64x4 rectangle to 16x1. 
+__declspec(naked) +void ScaleRowDown4Box_AVX2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_ptr + mov esi, [esp + 8 + 8] // src_stride + mov edx, [esp + 8 + 12] // dst_ptr + mov ecx, [esp + 8 + 16] // dst_width + lea edi, [esi + esi * 2] // src_stride * 3 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0x00ff00ff + vpsrlw ymm7, ymm7, 8 + + wloop: + vmovdqu ymm0, [eax] // average rows + vmovdqu ymm1, [eax + 32] + vpavgb ymm0, ymm0, [eax + esi] + vpavgb ymm1, ymm1, [eax + esi + 32] + vmovdqu ymm2, [eax + esi * 2] + vmovdqu ymm3, [eax + esi * 2 + 32] + vpavgb ymm2, ymm2, [eax + edi] + vpavgb ymm3, ymm3, [eax + edi + 32] + lea eax, [eax + 64] + vpavgb ymm0, ymm0, ymm2 + vpavgb ymm1, ymm1, ymm3 + + vpand ymm2, ymm0, ymm7 // average columns (64 to 32 pixels) + vpand ymm3, ymm1, ymm7 + vpsrlw ymm0, ymm0, 8 + vpsrlw ymm1, ymm1, 8 + vpavgw ymm0, ymm0, ymm2 + vpavgw ymm1, ymm1, ymm3 + vpackuswb ymm0, ymm0, ymm1 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vpand ymm2, ymm0, ymm7 // average columns (32 to 16 pixels) + vpsrlw ymm0, ymm0, 8 + vpavgw ymm0, ymm0, ymm2 + vpackuswb ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + + vmovdqu [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg wloop + + pop edi + pop esi + vzeroupper + ret + } +} +#endif // HAS_SCALEROWDOWN4_AVX2 + // Point samples 32 pixels to 24 pixels. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. // Then shuffled to do the scaling. -// Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -344,8 +538,7 @@ void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // xmm7 kRound34 // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -402,8 +595,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, } // Note that movdqa+palign may be better than movdqu. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
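The 3/4 scalers above replace pixel-by-pixel addressing with pshufb shuffle tables (kShuf*), but the underlying sampling pattern is simple: every 4 source pixels yield 3 destination pixels. A scalar sketch of the unfiltered variant, modeled on libyuv's C fallback (the exact choice of which source pixel feeds each output is an assumption here):

#include <stddef.h>
#include <stdint.h>

/* Assumed C model of the 3/4 point sampler, not part of the patch. */
void ScaleRowDown34_C(const uint8_t* src_ptr, ptrdiff_t src_stride,
                      uint8_t* dst_ptr, int dst_width) {
  int x;
  (void)src_stride;
  for (x = 0; x < dst_width; x += 3) {
    dst_ptr[0] = src_ptr[0];
    dst_ptr[1] = src_ptr[1];
    dst_ptr[2] = src_ptr[3];
    dst_ptr += 3;   /* 3 outputs ... */
    src_ptr += 4;   /* ... per 4 inputs */
  }
}

The filtered 34_0 box variant, whose declspec annotations the patch also trims, continues below.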
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -465,7 +657,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, // 3/8 point sampler // Scale 32 pixels to 12 -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -496,7 +688,7 @@ void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, } // Scale 16x3 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -561,7 +753,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, } // Scale 16x2 pixels to 6x1 with interpolation -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -605,76 +797,68 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } } -// Reads 16xN bytes and produces 16 shorts at a time. -// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. -__declspec(naked) __declspec(align(16)) -void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height) { +// Reads 16 bytes and accumulates to 16 shorts at a time. +__declspec(naked) +void ScaleAddRow_SSE2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { __asm { - push esi - push edi - push ebx - push ebp - mov esi, [esp + 16 + 4] // src_ptr - mov edx, [esp + 16 + 8] // src_stride - mov edi, [esp + 16 + 12] // dst_ptr - mov ecx, [esp + 16 + 16] // dst_width - mov ebx, [esp + 16 + 20] // height - pxor xmm4, xmm4 - dec ebx + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + pxor xmm5, xmm5 + // sum rows xloop: - // first row - movdqu xmm0, [esi] - lea eax, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - lea esi, [esi + 16] - mov ebp, ebx - test ebp, ebp - je ydone - - // sum remaining rows - yloop: - movdqu xmm2, [eax] // read 16 pixels - lea eax, [eax + edx] // advance to next row - movdqa xmm3, xmm2 - punpcklbw xmm2, xmm4 - punpckhbw xmm3, xmm4 + movdqu xmm3, [eax] // read 16 bytes + lea eax, [eax + 16] + movdqu xmm0, [edx] // read 16 words from destination + movdqu xmm1, [edx + 16] + movdqa xmm2, xmm3 + punpcklbw xmm2, xmm5 + punpckhbw xmm3, xmm5 paddusw xmm0, xmm2 // sum 16 words paddusw xmm1, xmm3 - sub ebp, 1 - jg yloop - - ydone: - movdqu [edi], xmm0 - movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - + movdqu [edx], xmm0 // write 16 words to destination + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] sub ecx, 16 jg xloop + ret + } +} - pop ebp - pop ebx - pop edi - pop esi +#ifdef HAS_SCALEADDROW_AVX2 +// Reads 32 bytes and accumulates to 32 shorts at a time. 
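Note the interface change in the hunk above: the old ScaleAddRows_SSE2 walked src_height rows internally, while the new ScaleAddRow_SSE2 accumulates a single source row into a uint16 sum row, so the caller now zeroes the accumulator and invokes the row function once per line. A scalar model of the new contract (an assumed sketch; the SSE2/AVX2 paths use saturating adds, paddusw/vpaddusw, where this plain add presumes the sums do not overflow 16 bits):

#include <stdint.h>

/* Accumulate one row of bytes into a row of 16-bit sums. */
void ScaleAddRow_C(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) {
  int x;
  for (x = 0; x < src_width; ++x) {
    dst_ptr[x] += src_ptr[x];
  }
}

The AVX2 version of the same accumulator follows.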
+__declspec(naked) +void ScaleAddRow_AVX2(const uint8* src_ptr, uint16* dst_ptr, int src_width) { + __asm { + mov eax, [esp + 4] // src_ptr + mov edx, [esp + 8] // dst_ptr + mov ecx, [esp + 12] // src_width + vpxor ymm5, ymm5, ymm5 + + // sum rows + xloop: + vmovdqu ymm3, [eax] // read 32 bytes + lea eax, [eax + 32] + vpermq ymm3, ymm3, 0xd8 // unmutate for vpunpck + vpunpcklbw ymm2, ymm3, ymm5 + vpunpckhbw ymm3, ymm3, ymm5 + vpaddusw ymm0, ymm2, [edx] // sum 16 words + vpaddusw ymm1, ymm3, [edx + 32] + vmovdqu [edx], ymm0 // write 32 words to destination + vmovdqu [edx + 32], ymm1 + lea edx, [edx + 64] + sub ecx, 32 + jg xloop + + vzeroupper ret } } +#endif // HAS_SCALEADDROW_AVX2 // Bilinear column filtering. SSSE3 version. -// TODO(fbarchard): Port to Neon -// TODO(fbarchard): Switch the following: -// xor ebx, ebx -// mov bx, word ptr [esi + eax] // 2 source x0 pixels -// To -// movzx ebx, word ptr [esi + eax] // 2 source x0 pixels -// when drmemory bug fixed. -// https://code.google.com/p/drmemory/issues/detail?id=1396 - -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { @@ -751,8 +935,7 @@ void ScaleFilterCols_SSSE3(uint8* dst_ptr, const uint8* src_ptr, } // Reads 16 pixels, duplicates them and writes 32 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, int dst_width, int x, int dx) { __asm { @@ -777,8 +960,7 @@ void ScaleColsUp2_SSE2(uint8* dst_ptr, const uint8* src_ptr, } // Reads 8 pixels, throws half away and writes 4 even pixels (0, 2, 4, 6) -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -803,8 +985,7 @@ void ScaleARGBRowDown2_SSE2(const uint8* src_argb, } // Blends 8x1 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -832,8 +1013,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8* src_argb, } // Blends 8x2 rectangle to 4x1. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { @@ -867,8 +1047,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, } // Reads 4 pixels at a time. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { @@ -904,8 +1083,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, } // Blends four 2x2 to 4x1. -// Alignment requirement: dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, @@ -953,7 +1131,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb, } // Column scaling unfiltered. SSE2 version. 
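ScaleARGBCols_SSE2 below, like ScaleFilterCols_SSSE3 above, steps through the source with a 16.16 fixed-point coordinate: the top 16 bits of x index the source pixel, and dx is the per-output increment produced by the FixedDiv helpers at the end of this file. The unfiltered idiom in scalar form, treating each ARGB pixel as one 32-bit word (an illustrative sketch, not the patch's code):

#include <stdint.h>

void ScaleARGBCols_C(uint8_t* dst_argb, const uint8_t* src_argb,
                     int dst_width, int x, int dx) {
  const uint32_t* src = (const uint32_t*)src_argb;
  uint32_t* dst = (uint32_t*)dst_argb;
  int j;
  for (j = 0; j < dst_width; ++j) {
    dst[j] = src[x >> 16];  /* integer part of the 16.16 coordinate */
    x += dx;                /* fixed-point step per destination pixel */
  }
}

The filtered SSSE3 variant additionally blends the two neighboring source pixels using the fractional bits of x, via the kShuffleColARGB/kShuffleFractions tables.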
-__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1044,7 +1222,7 @@ static uvec8 kShuffleFractions = { 0u, 0u, 0u, 0u, 0u, 0u, 0u, 0u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, 4u, }; -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1115,8 +1293,7 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, } // Reads 4 pixels, duplicates them and writes 8 pixels. -// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -__declspec(naked) __declspec(align(16)) +__declspec(naked) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int x, int dx) { __asm { @@ -1141,7 +1318,7 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) __declspec(align(16)) +__declspec(naked) int FixedDiv_X86(int num, int div) { __asm { mov eax, [esp + 4] // num @@ -1154,7 +1331,7 @@ int FixedDiv_X86(int num, int div) { } // Divide num by div and return as 16.16 fixed point result. -__declspec(naked) __declspec(align(16)) +__declspec(naked) int FixedDiv1_X86(int num, int div) { __asm { mov eax, [esp + 4] // num @@ -1169,8 +1346,7 @@ int FixedDiv1_X86(int num, int div) { ret } } - -#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) +#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) #ifdef __cplusplus } // extern "C" -- 2.7.4
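As a closing reference point for the FixedDiv helpers near the end of the diff: the cdq + shld/shl + idiv sequence in FixedDiv_X86 computes (num << 16) / div with a 64-bit intermediate, i.e. roughly this portable C (a sketch under that assumption; FixedDiv1_X86 is the analogous computation biased for inclusive endpoints):

#include <stdint.h>

/* 16.16 fixed-point divide; the 64-bit cast mirrors the edx:eax
 * widening done by cdq/shld before idiv in the asm. */
int FixedDiv_C(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

Callers use the result as the dx step for the column-scaling rows shown above.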