diff --git a/GEMINI.md b/GEMINI.md index 3bda686fd..03cdc986d 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -1,44 +1,62 @@ # Gemini Project Context: libyuv Row Functions -This file provides context for the core row-processing architecture of libyuv. Use these guidelines when refactoring, reviewing, or generating code within the `row_*.cc` files. +This file provides context for the core row-processing architecture of +libyuv. Use these guidelines when refactoring, reviewing, or generating +code within the `row_*.cc` files. ## Architectural Overview -Libyuv uses a dispatch system where high-level conversion functions call optimized "Row" functions. These functions are categorized by SIMD architecture and compiler compatibility. +Libyuv uses a dispatch system where high-level conversion functions call +optimized "Row" functions. These functions are categorized by SIMD architecture +and compiler compatibility. ## Source File Map ### x86 Architectures (32-bit and 64-bit) -* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets. -* **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`. +* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for + GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly + for 64-bit targets. +* **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics + specifically for Visual C++ (MSVC). Can be tested with Clang using + `-DLIBYUV_ENABLE_ROWWIN`. * **Note**: Use either `row_gcc` or `row_win`, never both. ### ARM Architectures -* **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for GCC/Clang. -* **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly for GCC/Clang. +* **row_neon.cc**: 32-bit ARM. Written entirely in inline assembly for + GCC/Clang. +* **row_neon64.cc**: 64-bit ARM (AArch64). Written entirely in inline assembly + for GCC/Clang. * **row_sve.cc**: ARMv9 Scalable Vector Extensions (SVE). -* **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE (SSVE). +* **row_sme.cc**: ARMv9 Scalable Matrix Extension (SME) and Streaming SVE + (SSVE). ### Other Architectures -* **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized for SiFive X280. +* **row_rvv.cc**: RISC-V Vector (RVV). Implemented using intrinsics. Optimized + for SiFive X280. * **row_lsx.cc / row_lasx.cc**: Loongarch MIPS-like extensions. ### Utility and Fallbacks -* **row_common.cc**: Portable C/C++ versions. This is the reference implementation. -* **row_any.cc**: Handles "remainder" pixels for widths not multiples of SIMD register size. Used for x86, NEON, and MIPS. Not required for SVE, SME, or RVV due to hardware-level masking. +* **row_common.cc**: Portable C/C++ versions. This is the reference + implementation. +* **row_any.cc**: Handles "remainder" pixels for widths not multiples of SIMD + register size. Used for x86, NEON, and MIPS. Not required for SVE, SME, or + RVV due to hardware-level masking. ## Coding Guidelines -1. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**. -2. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions. +1. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 + only**. +2. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to + enable or disable specific AVX512 versions. ## Changelist (CL) & Commit Guidelines -When generating descriptions, follow the Chromium/Google standard format. Wrap commit message text at 72 characters +When generating descriptions, follow the Chromium/Google standard format. Wrap +commit message text at 72 characters ### Format Example: diff --git a/README.chromium b/README.chromium index f97dcea59..e025cb9d6 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1946 +Version: 1947 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5aced2a2a..1ec86f5eb 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -23,10 +23,11 @@ extern "C" { #endif // This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) -#if ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \ + defined(_M_X86)) +#if ((defined(_MSC_VER) && !defined(__clang__)) || \ + defined(LIBYUV_ENABLE_ROWWIN)) #define USE_ROW_WIN #else #define USE_ROW_GCC @@ -121,9 +122,9 @@ extern "C" { // The following are available on all x86 platforms, but // require VS2012, clang 3.4 or gcc 4.7. -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \ + defined(_M_X86)) #define HAS_ARGBMIRRORROW_AVX2 #define HAS_RGB24MIRRORROW_AVX2 #define HAS_ARGBTOUVMATRIXROW_AVX2 @@ -139,7 +140,7 @@ extern "C" { #define HAS_INTERPOLATEROW_AVX2 #endif -#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ +#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ (defined(VISUALC_HAS_AVX2) || defined(CLANG_HAS_AVX2) || \ defined(GCC_HAS_AVX2)) #define HAS_ARGBCOPYALPHAROW_AVX2 @@ -183,7 +184,7 @@ extern "C" { // The following are available for gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ - (defined(__x86_64__) || defined(__i386__)) && \ + (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_AB64TOARGBROW_SSSE3 #define HAS_ABGRTOAR30ROW_SSSE3 @@ -259,8 +260,8 @@ extern "C" { // The following are available for AVX2 gcc/clang x86 platforms: // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ - (defined(__x86_64__) || defined(__i386__)) && \ - (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \ + (defined(__x86_64__) || defined(__i386__)) && \ + (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) && \ !defined(LIBYUV_ENABLE_ROWWIN) #define HAS_AB64TOARGBROW_AVX2 #define HAS_ABGRTOAR30ROW_AVX2 @@ -342,19 +343,21 @@ extern "C" { #endif // This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) && \ - ((defined(_MSC_VER) && !defined(__clang__)) || \ +#if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_WIN) && \ + (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \ + defined(_M_X86)) && \ + ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) #define HAS_RAWTOARGBROW_AVX2 #define HAS_RGB24TOARGBROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 #define HAS_ARGB1555TOARGBROW_AVX2 #define HAS_ARGB4444TOARGBROW_AVX2 +#define HAS_ARGBSHUFFLEROW_AVX2 #if defined(__x86_64__) || defined(_M_X64) #define HAS_RAWTOARGBROW_AVX512BW #define HAS_RGB24TOARGBROW_AVX512BW +#define HAS_ARGBSHUFFLEROW_AVX512BW #endif #define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYMATRIXROW_AVX2 @@ -383,7 +386,6 @@ extern "C" { #endif #define HAS_ARGBTORGB24ROW_AVX512VBMI #define HAS_CONVERT16TO8ROW_AVX512BW -#define HAS_MERGEUVROW_AVX512BW #endif // The following are available for AVX512 clang x64 platforms: @@ -401,6 +403,11 @@ extern "C" { #define HAS_ARGBTOUVJROW_AVX512BW #define HAS_ARGBTOUVMATRIXROW_AVX512BW #define HAS_J400TOARGBROW_AVX512BW +#define HAS_MERGEUVROW_AVX512BW +#define HAS_MIRRORROW_AVX512BW +#define HAS_MIRRORSPLITUVROW_AVX512BW +#define HAS_SPLITUVROW_AVX512BW +#define HAS_RGBTOUVMATRIXROW_AVX512BW #endif // The following are available on Neon platforms: @@ -1041,7 +1048,7 @@ struct ArgbConstants { #endif -#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) +#define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) #define align_buffer_64(var, size) \ size_t var##_mem_size = (size); /* NOLINT */ \ @@ -1097,26 +1104,17 @@ struct ArgbConstants { #define IACA_UD_BYTES __asm__ __volatile__("\n\t .byte 0x0F, 0x0B"); #else /* Visual C */ -#define IACA_UD_BYTES \ - { __asm _emit 0x0F __asm _emit 0x0B } +#define IACA_UD_BYTES {__asm _emit 0x0F __asm _emit 0x0B} #define IACA_SSC_MARK(x) \ - { __asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90 } + {__asm mov ebx, x __asm _emit 0x64 __asm _emit 0x67 __asm _emit 0x90} #define IACA_VC64_START __writegsbyte(111, 111); #define IACA_VC64_END __writegsbyte(222, 222); #endif -#define IACA_START \ - { \ - IACA_UD_BYTES \ - IACA_SSC_MARK(111) \ - } -#define IACA_END \ - { \ - IACA_SSC_MARK(222) \ - IACA_UD_BYTES \ - } +#define IACA_START {IACA_UD_BYTES IACA_SSC_MARK(111)} +#define IACA_END {IACA_SSC_MARK(222) IACA_UD_BYTES} void I210AlphaToARGBRow_NEON(const uint16_t* src_y, const uint16_t* src_u, @@ -1828,9 +1826,9 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, int width, const struct ArgbConstants* c); void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c); + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2194,10 +2192,26 @@ void RGB565ToYMatrixRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_C(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_C(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, @@ -2210,8 +2224,30 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGBToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGBToUVMatrixRow_Any_AVX512BW(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, @@ -2301,18 +2337,66 @@ void RGB565ToUVMatrixRow_Any_AVX2(const uint8_t* src_rgb565, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGB565ToYMatrixRow_NEON(const uint8_t* src_rgb565, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGB565ToYMatrixRow_Any_NEON(const uint8_t* src_rgb565, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToYMatrixRow_Any_NEON(const uint8_t* src_argb1555, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB1555ToUVMatrixRow_Any_NEON(const uint8_t* src_argb1555, + int src_stride_argb1555, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGB4444ToYMatrixRow_Any_NEON(const uint8_t* src_argb4444, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGB4444ToUVMatrixRow_Any_NEON(const uint8_t* src_argb4444, + int src_stride_argb4444, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGB565ToUVMatrixRow_Any_NEON(const uint8_t* src_rgb565, + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, @@ -2340,9 +2424,22 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, int width, const struct ArgbConstants* c); -void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); -void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, const struct ArgbConstants* c); -void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c); +void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void RGBToYMatrixRow_Any_NEON(const uint8_t* src_rgb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void RGBToUVMatrixRow_Any_NEON(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, uint8_t* dst_y, @@ -2374,7 +2471,6 @@ void ARGBToYMatrixRow_Any_LASX(const uint8_t* src_argb, int width, const struct ArgbConstants* c); - void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, @@ -2432,15 +2528,29 @@ void RGBAToYRow_C(const uint8_t* src_rgb, uint8_t* dst_y, int width); void RGB565ToYRow_C(const uint8_t* src_rgb565, uint8_t* dst_y, int width); void ARGB1555ToYRow_C(const uint8_t* src_argb1555, uint8_t* dst_y, int width); void ARGB4444ToYRow_C(const uint8_t* src_argb4444, uint8_t* dst_y, int width); -void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void ARGBToYRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToYJRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ABGRToYRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ABGRToYJRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RGBAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGBAToYRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGBAToYJRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void BGRAToYRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void BGRAToYRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void ARGBToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -3040,12 +3150,16 @@ void ARGBToUVJ444Row_C(const uint8_t* src_argb, uint8_t* dst_v, int width); +void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LSX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width); void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void MirrorRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void MirrorRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width); @@ -3063,6 +3177,10 @@ void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void MirrorSplitUVRow_AVX512BW(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void MirrorSplitUVRow_AVX2(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, @@ -3124,6 +3242,10 @@ void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_AVX512BW(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -3140,6 +3262,10 @@ void SplitUVRow_RVV(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width); +void SplitUVRow_Any_AVX512BW(const uint8_t* src_ptr, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void SplitUVRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, @@ -4160,8 +4286,12 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, int width); void RAWToARGBRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_argb, int width); void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width); -void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width); -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width); +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width); +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, + uint8_t* dst_argb, + int width); void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width); void RAWToRGB24Row_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgb24, int width); @@ -4250,9 +4380,7 @@ void RGB24ToARGBRow_Any_SSSE3(const uint8_t* src_ptr, void RAWToARGBRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); +void RAWToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -4272,7 +4400,6 @@ void RAWToRGB24Row_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); - void RGB565ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/row_sve.h b/include/libyuv/row_sve.h index f7e2123a7..280d635b9 100644 --- a/include/libyuv/row_sve.h +++ b/include/libyuv/row_sve.h @@ -631,8 +631,8 @@ static inline void I422ToRGB565Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X - RGB8TORGB565_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X + RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X // Need to permute the data on the final iteration such that the // predicates (.b) line up with the 16-bit element data. "trn1 z20.b, z18.b, z19.b \n" @@ -694,8 +694,8 @@ static inline void I422ToARGB1555Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X - RGB8TOARGB1555_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X + RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" @@ -753,8 +753,8 @@ static inline void I422ToARGB4444Row_SVE_SC( // Calculate a predicate for the final iteration to deal with the tail. "cnth %[vl] \n" "whilelt p1.b, wzr, %w[width] \n" // - READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X - RGB8TOARGB4444_SVE_FROM_TOP_2X + READYUV422_SVE_2X I422TORGB_SVE_2X + RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X "st2h {z0.h, z1.h}, p1, [%[dst]] \n" "99: \n" diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d90f894f7..b12b94978 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1946 +#define LIBYUV_VERSION 1947 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc index 756f83cb3..36c5e575c 100644 --- a/source/compare_neon64.cc +++ b/source/compare_neon64.cc @@ -116,7 +116,7 @@ uint32_t HashDjb2_NEON(const uint8_t* src, int count, uint32_t seed) { uint32_t hash = seed; const uint32_t c16 = 0x92d9e201; // 33^16 uint32_t tmp, tmp2; - asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" + asm("ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%[kIdx]] \n" "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%[kMuls]] \n" // count is always a multiple of 16. diff --git a/source/compare_win.cc b/source/compare_win.cc index 9d5bb27cd..59374cd8a 100644 --- a/source/compare_win.cc +++ b/source/compare_win.cc @@ -41,8 +41,9 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a, return diff; } -__declspec(naked) uint32_t - SumSquareError_SSE2(const uint8_t* src_a, const uint8_t* src_b, int count) { +__declspec(naked) uint32_t SumSquareError_SSE2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b @@ -81,8 +82,9 @@ __declspec(naked) uint32_t #ifdef HAS_SUMSQUAREERROR_AVX2 // C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX. #pragma warning(disable : 4752) -__declspec(naked) uint32_t - SumSquareError_AVX2(const uint8_t* src_a, const uint8_t* src_b, int count) { +__declspec(naked) uint32_t SumSquareError_AVX2(const uint8_t* src_a, + const uint8_t* src_b, + int count) { __asm { mov eax, [esp + 4] // src_a mov edx, [esp + 8] // src_b @@ -146,8 +148,9 @@ uvec32 kHashMul3 = { 0x00000001, // 33 ^ 0 }; -__declspec(naked) uint32_t - HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) { +__declspec(naked) uint32_t HashDjb2_SSE41(const uint8_t* src, + int count, + uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count @@ -197,8 +200,9 @@ __declspec(naked) uint32_t // Visual C 2012 required for AVX2. #ifdef HAS_HASHDJB2_AVX2 -__declspec(naked) uint32_t - HashDjb2_AVX2(const uint8_t* src, int count, uint32_t seed) { +__declspec(naked) uint32_t HashDjb2_AVX2(const uint8_t* src, + int count, + uint32_t seed) { __asm { mov eax, [esp + 4] // src mov ecx, [esp + 8] // count diff --git a/source/convert.cc b/source/convert.cc index c5e4be418..fbef68f57 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -13,12 +13,11 @@ #include #include "libyuv/basic_types.h" +#include "libyuv/convert_from_argb.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" -#include "libyuv/convert_from_argb.h" #include "libyuv/rotate.h" #include "libyuv/row.h" - #include "libyuv/scale.h" // For ScalePlane() #include "libyuv/scale_row.h" // For FixedDiv #include "libyuv/scale_uv.h" // For UVScale() @@ -2034,8 +2033,8 @@ int ARGBToI420(const uint8_t* src_argb, int width, int height) { return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbI601Constants, width, height); } LIBYUV_API @@ -2056,7 +2055,7 @@ int ARGBToI420Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2121,34 +2120,34 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } + if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } + if (TestCpuFlag(kCpuHasSVE2)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } + if (TestCpuFlag(kCpuHasSME)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2439,8 +2438,8 @@ int BGRAToI420(const uint8_t* src_bgra, int width, int height) { return ARGBToI420Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kBgraI601Constants, width, height); } // Convert BGRA to I422. @@ -2456,8 +2455,8 @@ int BGRAToI422(const uint8_t* src_bgra, int width, int height) { return ARGBToI422Matrix(src_bgra, src_stride_bgra, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kBgraI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kBgraI601Constants, width, height); } // Convert ABGR to I422. @@ -2473,8 +2472,8 @@ int ABGRToI422(const uint8_t* src_abgr, int width, int height) { return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kAbgrI601Constants, width, height); } // Convert RGBA to I422. @@ -2490,8 +2489,8 @@ int RGBAToI422(const uint8_t* src_rgba, int width, int height) { return ARGBToI422Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kRgbaI601Constants, width, height); } // Convert ABGR to I420. @@ -2507,8 +2506,8 @@ int ABGRToI420(const uint8_t* src_abgr, int width, int height) { return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kAbgrI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kAbgrI601Constants, width, height); } // Convert RGBA to I420. @@ -2524,8 +2523,8 @@ int RGBAToI420(const uint8_t* src_rgba, int width, int height) { return ARGBToI420Matrix(src_rgba, src_stride_rgba, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kRgbaI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kRgbaI601Constants, width, height); } // Enabled if 1 pass is available @@ -2569,6 +2568,14 @@ int RGB24ToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW; + } + } +#endif #if defined(HAS_RGBTOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON; @@ -2603,9 +2610,11 @@ int RGB24ToI420(const uint8_t* src_rgb24, } for (y = 0; y < height - 1; y += 2) { - RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants); + RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, + &kArgbI601Constants); RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants); - RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants); + RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, + &kArgbI601Constants); src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -2854,15 +2863,15 @@ int RGB24ToJ420(const uint8_t* src_rgb24, // Convert RAW to I420. LIBYUV_API int RAWToI420(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { + int src_stride_rgb24, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*RGBToUVMatrixRow)(const uint8_t* src_rgb, int src_stride_rgb, uint8_t* dst_u, uint8_t* dst_v, int width, @@ -2886,6 +2895,14 @@ int RAWToI420(const uint8_t* src_rgb24, } } #endif +#if defined(HAS_RGBTOUVMATRIXROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + RGBToUVMatrixRow = RGBToUVMatrixRow_AVX512BW; + } + } +#endif #if defined(HAS_RGBTOUVMATRIXROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { RGBToUVMatrixRow = RGBToUVMatrixRow_Any_NEON; @@ -2920,9 +2937,11 @@ int RAWToI420(const uint8_t* src_rgb24, } for (y = 0; y < height - 1; y += 2) { - RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, &kArgbI601Constants); + RGBToUVMatrixRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width, + &kArgbI601Constants); RGBToYMatrixRow(src_rgb24, dst_y, width, &kArgbI601Constants); - RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, &kArgbI601Constants); + RGBToYMatrixRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width, + &kArgbI601Constants); src_rgb24 += src_stride_rgb24 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -3622,9 +3641,11 @@ int RGB565ToI420(const uint8_t* src_rgb565, int y; void (*RGB565ToUVMatrixRow)(const uint8_t* src_rgb565, int src_stride_rgb565, uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = RGB565ToUVMatrixRow_C; - void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = RGB565ToYMatrixRow_C; + const struct ArgbConstants* c) = + RGB565ToUVMatrixRow_C; + void (*RGB565ToYMatrixRow)(const uint8_t* src_rgb565, uint8_t* dst_y, + int width, const struct ArgbConstants* c) = + RGB565ToYMatrixRow_C; #if defined(HAS_RGB565TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { @@ -3671,9 +3692,11 @@ int RGB565ToI420(const uint8_t* src_rgb565, } for (y = 0; y < height - 1; y += 2) { - RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width, &kArgbI601Constants); + RGB565ToUVMatrixRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width, + &kArgbI601Constants); RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants); - RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width, &kArgbI601Constants); + RGB565ToYMatrixRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, + width, &kArgbI601Constants); src_rgb565 += src_stride_rgb565 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -3681,30 +3704,31 @@ int RGB565ToI420(const uint8_t* src_rgb565, } if (height & 1) { RGB565ToYMatrixRow(src_rgb565, dst_y, width, &kArgbI601Constants); - RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width, &kArgbI601Constants); + RGB565ToUVMatrixRow(src_rgb565, 0, dst_u, dst_v, width, + &kArgbI601Constants); } return 0; } // Convert ARGB1555 to I420. LIBYUV_API int ARGB1555ToI420(const uint8_t* src_argb1555, - int src_stride_argb1555, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { + int src_stride_argb1555, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*ARGB1555ToUVMatrixRow)( const uint8_t* src_argb1555, int src_stride_argb1555, uint8_t* dst_u, - uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGB1555ToUVMatrixRow_C; - void (*ARGB1555ToYMatrixRow)( - const uint8_t* src_argb1555, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGB1555ToYMatrixRow_C; + uint8_t* dst_v, int width, const struct ArgbConstants* c) = + ARGB1555ToUVMatrixRow_C; + void (*ARGB1555ToYMatrixRow)(const uint8_t* src_argb1555, uint8_t* dst_y, + int width, const struct ArgbConstants* c) = + ARGB1555ToYMatrixRow_C; #if defined(HAS_ARGB1555TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { @@ -3751,9 +3775,11 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } for (y = 0; y < height - 1; y += 2) { - ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width, &kArgbI601Constants); + ARGB1555ToUVMatrixRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, + width, &kArgbI601Constants); ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants); - ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width, &kArgbI601Constants); + ARGB1555ToYMatrixRow(src_argb1555 + src_stride_argb1555, + dst_y + dst_stride_y, width, &kArgbI601Constants); src_argb1555 += src_stride_argb1555 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -3761,30 +3787,31 @@ int ARGB1555ToI420(const uint8_t* src_argb1555, } if (height & 1) { ARGB1555ToYMatrixRow(src_argb1555, dst_y, width, &kArgbI601Constants); - ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width, &kArgbI601Constants); + ARGB1555ToUVMatrixRow(src_argb1555, 0, dst_u, dst_v, width, + &kArgbI601Constants); } return 0; } // Convert ARGB4444 to I420. LIBYUV_API int ARGB4444ToI420(const uint8_t* src_argb4444, - int src_stride_argb4444, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { + int src_stride_argb4444, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; void (*ARGB4444ToUVMatrixRow)( const uint8_t* src_argb4444, int src_stride_argb4444, uint8_t* dst_u, - uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGB4444ToUVMatrixRow_C; - void (*ARGB4444ToYMatrixRow)( - const uint8_t* src_argb4444, uint8_t* dst_y, int width, - const struct ArgbConstants* c) = ARGB4444ToYMatrixRow_C; + uint8_t* dst_v, int width, const struct ArgbConstants* c) = + ARGB4444ToUVMatrixRow_C; + void (*ARGB4444ToYMatrixRow)(const uint8_t* src_argb4444, uint8_t* dst_y, + int width, const struct ArgbConstants* c) = + ARGB4444ToYMatrixRow_C; #if defined(HAS_ARGB4444TOYMATRIXROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { @@ -3831,9 +3858,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } for (y = 0; y < height - 1; y += 2) { - ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width, &kArgbI601Constants); + ARGB4444ToUVMatrixRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, + width, &kArgbI601Constants); ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants); - ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, width, &kArgbI601Constants); + ARGB4444ToYMatrixRow(src_argb4444 + src_stride_argb4444, + dst_y + dst_stride_y, width, &kArgbI601Constants); src_argb4444 += src_stride_argb4444 * 2; dst_y += dst_stride_y * 2; dst_u += dst_stride_u; @@ -3841,7 +3870,8 @@ int ARGB4444ToI420(const uint8_t* src_argb4444, } if (height & 1) { ARGB4444ToYMatrixRow(src_argb4444, dst_y, width, &kArgbI601Constants); - ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width, &kArgbI601Constants); + ARGB4444ToUVMatrixRow(src_argb4444, 0, dst_u, dst_v, width, + &kArgbI601Constants); } return 0; } @@ -3993,7 +4023,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -{ + { // Allocate 1 row of ARGB. const int row_size = (width * 4 + 31) & ~31; align_buffer_64(row, row_size); diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 2df97d079..a0b9c5d37 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3720,7 +3720,7 @@ int RGB24ToARGB(const uint8_t* src_rgb24, RGB24ToARGBRow = RGB24ToARGBRow_RVV; } #endif -for (y = 0; y < height; ++y) { + for (y = 0; y < height; ++y) { RGB24ToARGBRow(src_rgb24, dst_argb, width); src_rgb24 += src_stride_rgb24; dst_argb += dst_stride_argb; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index d912f4537..77b3851d4 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -35,8 +35,8 @@ int ARGBToI444(const uint8_t* src_argb, int width, int height) { return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbI601Constants, width, height); } LIBYUV_API @@ -54,10 +54,9 @@ int ARGBToI444Matrix(const uint8_t* src_argb, int y; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; - void (*ARGBToUV444MatrixRow)(const uint8_t* src_argb, uint8_t* dst_u, - uint8_t* dst_v, int width, - const struct ArgbConstants* c) = -ARGBToUV444MatrixRow_C; + void (*ARGBToUV444MatrixRow)( + const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width, + const struct ArgbConstants* c) = ARGBToUV444MatrixRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -188,8 +187,8 @@ int ARGBToI422(const uint8_t* src_argb, int width, int height) { return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbI601Constants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbI601Constants, width, height); } LIBYUV_API @@ -210,7 +209,7 @@ int ARGBToI422Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -275,34 +274,34 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } + if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } + if (TestCpuFlag(kCpuHasSVE2)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } + if (TestCpuFlag(kCpuHasSME)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -359,8 +358,9 @@ int ARGBToNV12(const uint8_t* src_argb, int dst_stride_uv, int width, int height) { - return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_uv, - dst_stride_uv, &kArgbI601Constants, width, height); + return ARGBToNV12Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, + dst_uv, dst_stride_uv, &kArgbI601Constants, width, + height); } LIBYUV_API @@ -380,7 +380,7 @@ int ARGBToNV12Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -445,34 +445,34 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } + if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } + if (TestCpuFlag(kCpuHasSVE2)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } + if (TestCpuFlag(kCpuHasSME)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -565,7 +565,7 @@ ARGBToUVMatrixRow_C; MergeUVRow(row_u, row_v, dst_uv, halfwidth); ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants); ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width, - argbconstants); + argbconstants); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_uv += dst_stride_uv; @@ -595,7 +595,7 @@ int ARGBToNV21Matrix(const uint8_t* src_argb, void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, const struct ArgbConstants* c) = -ARGBToUVMatrixRow_C; + ARGBToUVMatrixRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -660,34 +660,34 @@ ARGBToUVMatrixRow_C; #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } + if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } + if (TestCpuFlag(kCpuHasSVE2)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } + if (TestCpuFlag(kCpuHasSME)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -780,7 +780,7 @@ ARGBToUVMatrixRow_C; MergeUVRow(row_u, row_v, dst_vu, halfwidth); ARGBToYMatrixRow(src_argb, dst_y, width, argbconstants); ARGBToYMatrixRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width, - argbconstants); + argbconstants); src_argb += src_stride_argb * 2; dst_y += dst_stride_y * 2; dst_vu += dst_stride_uv; @@ -864,7 +864,8 @@ int ARGBToYUY2Matrix(const uint8_t* src_argb, int y; void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; + const struct ArgbConstants* c) = + ARGBToUVMatrixRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; void (*I422ToYUY2Row)(const uint8_t* src_y, const uint8_t* src_u, @@ -976,7 +977,8 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb, int y; void (*ARGBToUVMatrixRow)(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, uint8_t* dst_v, int width, - const struct ArgbConstants* c) = ARGBToUVMatrixRow_C; + const struct ArgbConstants* c) = + ARGBToUVMatrixRow_C; void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; void (*I422ToUYVYRow)(const uint8_t* src_y, const uint8_t* src_u, @@ -1077,8 +1079,6 @@ int ARGBToUYVYMatrix(const uint8_t* src_argb, return 0; } - - // Same as NV12 but U and V swapped. LIBYUV_API int ARGBToNV21(const uint8_t* src_argb, @@ -1089,8 +1089,9 @@ int ARGBToNV21(const uint8_t* src_argb, int dst_stride_vu, int width, int height) { - return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kArgbI601Constants, width, height); + return ARGBToNV21Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, + dst_vu, dst_stride_vu, &kArgbI601Constants, width, + height); } LIBYUV_API @@ -1102,8 +1103,9 @@ int ABGRToNV12(const uint8_t* src_abgr, int dst_stride_uv, int width, int height) { - return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_uv, - dst_stride_uv, &kAbgrI601Constants, width, height); + return ARGBToNV12Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, + dst_uv, dst_stride_uv, &kAbgrI601Constants, width, + height); } // Same as NV12 but U and V swapped. @@ -1116,8 +1118,9 @@ int ABGRToNV21(const uint8_t* src_abgr, int dst_stride_vu, int width, int height) { - return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_vu, - dst_stride_vu, &kAbgrI601Constants, width, height); + return ARGBToNV21Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, + dst_vu, dst_stride_vu, &kAbgrI601Constants, width, + height); } // Convert ARGB to YUY2. @@ -1819,8 +1822,8 @@ int ARGBToJ444(const uint8_t* src_argb, int width, int height) { return ARGBToI444Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbJPEGConstants, width, height); } // Convert ARGB to J420. (JPeg full range I420). @@ -1836,8 +1839,8 @@ int ARGBToJ420(const uint8_t* src_argb, int width, int height) { return ARGBToI420Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbJPEGConstants, width, height); } // Convert ARGB to J422. (JPeg full range I422). @@ -1853,8 +1856,8 @@ int ARGBToJ422(const uint8_t* src_argb, int width, int height) { return ARGBToI422Matrix(src_argb, src_stride_argb, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kArgbJPEGConstants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kArgbJPEGConstants, width, height); } // Convert ARGB to J400. @@ -1978,8 +1981,8 @@ int ABGRToJ420(const uint8_t* src_abgr, int width, int height) { return ARGBToI420Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kAbgrJPEGConstants, width, height); } // Convert ABGR to J422. (JPeg full range I422). @@ -1995,8 +1998,8 @@ int ABGRToJ422(const uint8_t* src_abgr, int width, int height) { return ARGBToI422Matrix(src_abgr, src_stride_abgr, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, &kAbgrJPEGConstants, - width, height); + dst_stride_u, dst_v, dst_stride_v, + &kAbgrJPEGConstants, width, height); } // Convert ABGR to J400. @@ -2165,7 +2168,7 @@ int RAWToNV21Matrix(const uint8_t* src_raw, void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) = ARGBToYMatrixRow_C; void (*MergeUVRow)(const uint8_t* src_uj, const uint8_t* src_vj, - uint8_t* dst_vu, int width) = MergeUVRow_C; + uint8_t* dst_vu, int width) = MergeUVRow_C; #if defined(HAS_ARGBTOYMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; @@ -2298,34 +2301,34 @@ int RAWToNV21Matrix(const uint8_t* src_raw, } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; - } + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_NEON_I8MM) - if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; - if (IS_ALIGNED(width, 16)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; - } + if (TestCpuFlag(kCpuHasNEON) && TestCpuFlag(kCpuHasNeonI8MM)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_Any_NEON_I8MM; + if (IS_ALIGNED(width, 16)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_NEON_I8MM; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SVE2) - if (TestCpuFlag(kCpuHasSVE2)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; - } + if (TestCpuFlag(kCpuHasSVE2)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SVE2; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SME) - if (TestCpuFlag(kCpuHasSME)) { - if (IS_ALIGNED(width, 2)) { - ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; - } + if (TestCpuFlag(kCpuHasSME)) { + if (IS_ALIGNED(width, 2)) { + ARGBToUVMatrixRow = ARGBToUVMatrixRow_SME; } + } #endif #if defined(HAS_ARGBTOUVMATRIXROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { @@ -2424,7 +2427,8 @@ int RAWToNV21Matrix(const uint8_t* src_raw, ARGBToUVMatrixRow(row, row_size, row_u, row_v, width, argbconstants); MergeUVRow(row_v, row_u, dst_vu, halfwidth); ARGBToYMatrixRow(row, dst_y, width, argbconstants); - ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, argbconstants); + ARGBToYMatrixRow(row + row_size, dst_y + dst_stride_y, width, + argbconstants); src_raw += src_stride_raw * 2; dst_y += dst_stride_y * 2; dst_vu += dst_stride_vu; @@ -2482,7 +2486,6 @@ int RGB24ToNV12(const uint8_t* src_rgb24, height); } - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/planar_functions.cc b/source/planar_functions.cc index cff7c5d0a..3481d643d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -8,13 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/convert_from_argb.h" // For ArgbConstants #include "libyuv/planar_functions.h" #include #include #include // for memset() +#include "libyuv/convert_from_argb.h" // For ArgbConstants #include "libyuv/cpu_id.h" #include "libyuv/row.h" #include "libyuv/scale_row.h" // for ScaleRowDown2 @@ -630,6 +630,14 @@ void SplitUVPlane(const uint8_t* src_uv, } } #endif +#if defined(HAS_SPLITUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + SplitUVRow = SplitUVRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + SplitUVRow = SplitUVRow_AVX512BW; + } + } +#endif #if defined(HAS_SPLITUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SplitUVRow = SplitUVRow_Any_NEON; @@ -1087,7 +1095,7 @@ int NV21ToNV12(const uint8_t* src_y, } // Test if tile_height is a power of 2 (16 or 32) -#define IS_POWEROFTWO(x) (!((x) & ((x)-1))) +#define IS_POWEROFTWO(x) (!((x) & ((x) - 1))) // Detile a plane of data // tile width is 16 and assumed. @@ -2588,6 +2596,14 @@ void MirrorPlane(const uint8_t* src_y, } } #endif +#if defined(HAS_MIRRORROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + MirrorRow = MirrorRow_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + MirrorRow = MirrorRow_AVX512BW; + } + } +#endif #if defined(HAS_MIRRORROW_LSX) if (TestCpuFlag(kCpuHasLSX)) { MirrorRow = MirrorRow_Any_LSX; diff --git a/source/rotate.cc b/source/rotate.cc index 54e0c2e63..60940f51f 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -8,11 +8,11 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "libyuv/rotate.h" + #include #include -#include "libyuv/rotate.h" - #include "libyuv/convert.h" #include "libyuv/cpu_id.h" #include "libyuv/planar_functions.h" @@ -403,6 +403,11 @@ void SplitRotateUV180(const uint8_t* src, MirrorSplitUVRow = MirrorSplitUVRow_AVX2; } #endif +#if defined(HAS_MIRRORSPLITUVROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_AVX512BW; + } +#endif #if defined(HAS_MIRRORSPLITUVROW_LSX) if (TestCpuFlag(kCpuHasLSX) && IS_ALIGNED(width, 32)) { MirrorSplitUVRow = MirrorSplitUVRow_LSX; diff --git a/source/rotate_win.cc b/source/rotate_win.cc index 03eeee3a6..5b40f62a0 100644 --- a/source/rotate_win.cc +++ b/source/rotate_win.cc @@ -64,7 +64,7 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, mov eax, ebp movdqa xmm7, xmm6 palignr xmm7, xmm7, 8 - // Second round of bit swap. + // Second round of bit swap. punpcklwd xmm0, xmm2 punpcklwd xmm1, xmm3 movdqa xmm2, xmm0 @@ -77,8 +77,8 @@ __declspec(naked) void TransposeWx8_SSSE3(const uint8_t* src, movdqa xmm7, xmm5 palignr xmm6, xmm6, 8 palignr xmm7, xmm7, 8 - // Third round of bit swap. - // Write to the destination pointer. + // Third round of bit swap. + // Write to the destination pointer. punpckldq xmm0, xmm4 movq qword ptr [edx], xmm0 movdqa xmm4, xmm0 @@ -173,7 +173,7 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, movdqa xmm7, xmm5 lea eax, [eax + 8 * edi + 16] neg edi - // Second round of bit swap. + // Second round of bit swap. movdqa xmm5, xmm0 punpcklwd xmm0, xmm2 punpckhwd xmm5, xmm2 @@ -193,8 +193,8 @@ __declspec(naked) void TransposeUVWx8_SSE2(const uint8_t* src, punpckhwd xmm6, xmm7 movdqa xmm7, xmm6 - // Third round of bit swap. - // Write to the destination pointer. + // Third round of bit swap. + // Write to the destination pointer. movdqa xmm6, xmm0 punpckldq xmm0, xmm4 punpckhdq xmm6, xmm4 diff --git a/source/row_any.cc b/source/row_any.cc index cac6339d1..340adc188 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1919,6 +1919,9 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } +#ifdef HAS_MIRRORROW_AVX512BW +ANY11M(MirrorRow_Any_AVX512BW, MirrorRow_AVX512BW, 1, 63) +#endif #ifdef HAS_MIRRORROW_AVX2 ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) #endif @@ -2022,6 +2025,9 @@ ANY1(ARGBSetRow_Any_LSX, ARGBSetRow_LSX, uint32_t, 4, 3) #ifdef HAS_SPLITUVROW_SSE2 ANY12(SplitUVRow_Any_SSE2, SplitUVRow_SSE2, 0, 2, 0, 15) #endif +#ifdef HAS_SPLITUVROW_AVX512BW +ANY12(SplitUVRow_Any_AVX512BW, SplitUVRow_AVX512BW, 0, 2, 0, 63) +#endif #ifdef HAS_SPLITUVROW_AVX2 ANY12(SplitUVRow_Any_AVX2, SplitUVRow_AVX2, 0, 2, 0, 31) #endif @@ -2193,7 +2199,7 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) uint8_t* dst_v, int width) { \ SIMD_ALIGNED(uint8_t vin[256 * 2]); \ SIMD_ALIGNED(uint8_t vout[256 * 2]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ memset(vout, 0, sizeof(vout)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -2215,29 +2221,29 @@ ANY14(SplitARGBRow_Any_NEON, SplitARGBRow_NEON, 4, 15) memcpy(dst_v + (np >> 1), vout + 256, SS(r, 1)); \ } -#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ - int width, const struct ArgbConstants* c) { \ - SIMD_ALIGNED(uint8_t vin[256]); \ - SIMD_ALIGNED(uint8_t vout[256 * 2]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_u, dst_v, n, c); \ - } \ - memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \ - ANY_SIMD(vin, vout, vout + 256, MASK + 1, c); \ - memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ - memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r); \ +#define ANY12M(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_u, uint8_t* dst_v, \ + int width, const struct ArgbConstants* c) { \ + SIMD_ALIGNED(uint8_t vin[256]); \ + SIMD_ALIGNED(uint8_t vout[256 * 2]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_u, dst_v, n, c); \ + } \ + memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \ + ANY_SIMD(vin, vout, vout + 256, MASK + 1, c); \ + memcpy(dst_u + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ + memcpy(dst_v + (ptrdiff_t)n, vout + 256, (ptrdiff_t)r); \ } #define ANY12MS(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ - uint8_t* dst_v, int width, const struct ArgbConstants* c) { \ + void NAMEANY(const uint8_t* src_ptr, int src_stride, uint8_t* dst_u, \ + uint8_t* dst_v, int width, const struct ArgbConstants* c) { \ SIMD_ALIGNED(uint8_t vin[256 * 2]); \ SIMD_ALIGNED(uint8_t vout[256 * 2]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ memset(vout, 0, sizeof(vout)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -2291,6 +2297,9 @@ ANY12MS(ARGB4444ToUVMatrixRow_Any_AVX2, ARGB4444ToUVMatrixRow_AVX2, 0, 2, 31) #ifdef HAS_ARGBTOUVMATRIXROW_AVX512BW ANY12MS(ARGBToUVMatrixRow_Any_AVX512BW, ARGBToUVMatrixRow_AVX512BW, 0, 4, 63) #endif +#ifdef HAS_RGBTOUVMATRIXROW_AVX512BW +ANY12MS(RGBToUVMatrixRow_Any_AVX512BW, RGBToUVMatrixRow_AVX512BW, 0, 3, 63) +#endif #ifdef HAS_ARGBTOUVMATRIXROW_SSSE3 ANY12MS(ARGBToUVMatrixRow_Any_SSSE3, ARGBToUVMatrixRow_SSSE3, 0, 4, 7) #endif @@ -2307,20 +2316,20 @@ ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15) ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) #endif -#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK) \ - void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \ - const struct ArgbConstants* c) { \ - SIMD_ALIGNED(uint8_t vin[256]); \ - SIMD_ALIGNED(uint8_t vout[256]); \ - memset(vin, 0, sizeof(vin)); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(src_ptr, dst_ptr, n, c); \ - } \ - memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \ - ANY_SIMD(vin, vout, MASK + 1, c); \ - memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ +#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \ + const struct ArgbConstants* c) { \ + SIMD_ALIGNED(uint8_t vin[256]); \ + SIMD_ALIGNED(uint8_t vout[256]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n, c); \ + } \ + memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1, c); \ + memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ } #ifdef HAS_ARGBTOYROW_SSSE3 diff --git a/source/row_common.cc b/source/row_common.cc index a18c90d12..f44b0f313 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -14,7 +14,7 @@ #include // For memcpy and memset. #include "libyuv/basic_types.h" -#include "libyuv/convert_argb.h" // For kYuvI601Constants +#include "libyuv/convert_argb.h" // For kYuvI601Constants #include "libyuv/convert_from_argb.h" // For ArgbConstants #ifdef __cplusplus @@ -764,7 +764,7 @@ static __inline uint8_t RGBToUMatrix(uint8_t b0, uint8_t b3, const struct ArgbConstants* c) { return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 + - c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >> + c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >> 8; } static __inline uint8_t RGBToVMatrix(uint8_t b0, @@ -773,7 +773,7 @@ static __inline uint8_t RGBToVMatrix(uint8_t b0, uint8_t b3, const struct ArgbConstants* c) { return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 + - c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >> + c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >> 8; } @@ -783,7 +783,8 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb, const struct ArgbConstants* c) { int x; for (x = 0; x < width; ++x) { - dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c); + dst_y[0] = + RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c); src_argb += 4; dst_y += 1; } @@ -1513,18 +1514,18 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = \ YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); -#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \ - extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \ - ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \ - -(RV), 0, AY, AUV); \ - extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \ - ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \ - -(BV), 0, AY, AUV); \ - extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \ - ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \ - -(GV), -(RV), AY, AUV); \ - extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \ - ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \ +#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \ + extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \ + ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \ + -(RV), 0, AY, AUV); \ + extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \ + ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \ + -(BV), 0, AY, AUV); \ + extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \ + ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \ + -(GV), -(RV), AY, AUV); \ + extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \ + ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \ -(GV), -(BV), AY, AUV); // BT.601 limited range RGB to YUV coefficients @@ -3467,7 +3468,7 @@ void ARGBBlendRow_C(const uint8_t* src_argb, } #undef BLEND -#define UBLEND(f, b, a) (((a)*f) + ((255 - a) * b) + 255) >> 8 +#define UBLEND(f, b, a) (((a) * f) + ((255 - a) * b) + 255) >> 8 void BlendPlaneRow_C(const uint8_t* src0, const uint8_t* src1, const uint8_t* alpha, @@ -4618,8 +4619,7 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB24ToARGBRow_AVX2(src_rgb, row, twidth); - RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, - row + MAXTWIDTH * 4, twidth); + RGB24ToARGBRow_AVX2(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, twidth); ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb += twidth * 3; dst_u += twidth / 2; @@ -4629,6 +4629,29 @@ void RGBToUVMatrixRow_AVX2(const uint8_t* src_rgb, } #endif +#if defined(HAS_ARGBTOUVMATRIXROW_AVX512BW) && \ + defined(HAS_RGB24TOARGBROW_AVX512BW) +void RGBToUVMatrixRow_AVX512BW(const uint8_t* src_rgb, + int src_stride_rgb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { + SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4 * 2]); + while (width > 0) { + int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; + RGB24ToARGBRow_AVX512BW(src_rgb, row, twidth); + RGB24ToARGBRow_AVX512BW(src_rgb + src_stride_rgb, row + MAXTWIDTH * 4, + twidth); + ARGBToUVMatrixRow_AVX512BW(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); + src_rgb += twidth * 3; + dst_u += twidth / 2; + dst_v += twidth / 2; + width -= twidth; + } +} +#endif + #if defined(HAS_ARGBTOUVMATRIXROW_NEON) && defined(HAS_RGB24TOARGBROW_NEON) void RGBToUVMatrixRow_NEON(const uint8_t* src_rgb, int src_stride_rgb, @@ -4675,7 +4698,8 @@ void RGB565ToUVMatrixRow_C(const uint8_t* src_rgb565, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB565ToARGBRow_C(src_rgb565, row, twidth); - RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); + RGB565ToARGBRow_C(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, + twidth); ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb565 += twidth * 2; dst_u += twidth / 2; @@ -4712,8 +4736,8 @@ void RGB565ToUVMatrixRow_AVX2(const uint8_t* src_rgb565, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB565ToARGBRow_AVX2(src_rgb565, row, twidth); - RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, - row + MAXTWIDTH * 4, twidth); + RGB565ToARGBRow_AVX2(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, + twidth); ARGBToUVMatrixRow_AVX2(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb565 += twidth * 2; dst_u += twidth / 2; @@ -4751,7 +4775,8 @@ void RGB565ToUVMatrixRow_NEON(const uint8_t* src_rgb565, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; RGB565ToARGBRow_NEON(src_rgb565, row, twidth); - RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, twidth); + RGB565ToARGBRow_NEON(src_rgb565 + src_stride_rgb565, row + MAXTWIDTH * 4, + twidth); ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_rgb565 += twidth * 2; dst_u += twidth / 2; @@ -4786,7 +4811,8 @@ void ARGB1555ToUVMatrixRow_C(const uint8_t* src_argb1555, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; ARGB1555ToARGBRow_C(src_argb1555, row, twidth); - ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); + ARGB1555ToARGBRow_C(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, + twidth); ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb1555 += twidth * 2; dst_u += twidth / 2; @@ -4820,7 +4846,8 @@ void ARGB4444ToUVMatrixRow_C(const uint8_t* src_argb4444, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; ARGB4444ToARGBRow_C(src_argb4444, row, twidth); - ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); + ARGB4444ToARGBRow_C(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, + twidth); ARGBToUVMatrixRow_C(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb4444 += twidth * 2; dst_u += twidth / 2; @@ -4956,7 +4983,8 @@ void ARGB1555ToUVMatrixRow_NEON(const uint8_t* src_argb1555, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; ARGB1555ToARGBRow_NEON(src_argb1555, row, twidth); - ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555, row + MAXTWIDTH * 4, twidth); + ARGB1555ToARGBRow_NEON(src_argb1555 + src_stride_argb1555, + row + MAXTWIDTH * 4, twidth); ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb1555 += twidth * 2; dst_u += twidth / 2; @@ -4977,7 +5005,8 @@ void ARGB4444ToUVMatrixRow_NEON(const uint8_t* src_argb4444, while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; ARGB4444ToARGBRow_NEON(src_argb4444, row, twidth); - ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444, row + MAXTWIDTH * 4, twidth); + ARGB4444ToARGBRow_NEON(src_argb4444 + src_stride_argb4444, + row + MAXTWIDTH * 4, twidth); ARGBToUVMatrixRow_NEON(row, MAXTWIDTH * 4, dst_u, dst_v, twidth, c); src_argb4444 += twidth * 2; dst_u += twidth / 2; diff --git a/source/row_gcc.cc b/source/row_gcc.cc index e37e58b01..10ecf5910 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/convert_from_argb.h" // For ArgbConstants +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -120,11 +120,11 @@ static const lvec8 kShuffleNV21 = { #if defined(HAS_J400TOARGBROW_AVX2) || defined(HAS_J400TOARGBROW_AVX512BW) alignas(64) static const uint8_t kShuffleMaskJ400ToARGB[64] = { - 0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u, - 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u, - 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u, - 12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u -}; + 0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, + 3u, 128u, 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, + 7u, 7u, 7u, 128u, 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, + 10u, 128u, 11u, 11u, 11u, 128u, 12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, + 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u}; #endif #ifdef HAS_J400TOARGBROW_AVX2 @@ -149,16 +149,18 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(kShuffleMaskJ400ToARGB) // %3 + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(kShuffleMaskJ400ToARGB) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7"); } #endif // HAS_J400TOARGBROW_AVX2 #ifdef HAS_J400TOARGBROW_AVX512BW -void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void J400ToARGBRow_AVX512BW(const uint8_t* src_y, + uint8_t* dst_argb, + int width) { asm volatile( "vpternlogd $0xff,%%zmm7,%%zmm7,%%zmm7 \n" // 0xffffffff "vpslld $0x18,%%zmm7,%%zmm7 \n" // 0xff000000 @@ -179,10 +181,10 @@ void J400ToARGBRow_AVX512BW(const uint8_t* src_y, uint8_t* dst_argb, int width) "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_y), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleMaskJ400ToARGB) // %3 + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleMaskJ400ToARGB) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm5", "xmm7"); } #endif // HAS_J400TOARGBROW_AVX512BW @@ -221,15 +223,17 @@ void RGB24ToARGBRow_SSSE3(const uint8_t* src_rgb24, "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 : "m"(kShuffleMaskRGB24ToARGB[0]) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } #ifdef HAS_RGB24TOARGBROW_AVX2 -void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { // Reference to prevent discarding of kShuffleMaskRGB24ToARGB[1] which is // accessed via offset in assembly. const uvec8* dummy = &kShuffleMaskRGB24ToARGB[1]; @@ -267,9 +271,9 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 + : "+r"(src_rgb24), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 : "m"(kShuffleMaskRGB24ToARGB[0]) // %3 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } @@ -358,7 +362,10 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { static const uint32_t kPermdRAWToARGB_AVX512BW[16] = { 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 9, 9, 10, 11, 12}; -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) { +void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, + uint8_t* dst_argb, + const uint32_t* shuffler, + int width) { asm volatile( "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff "vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000 @@ -399,14 +406,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint "+r"(width) // %2 : "m"(kPermdRAWToARGB_AVX512BW), // %3 "m"(*shuffler) // %4 - : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5", "xmm6"); } -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - RGBToARGBRow_AVX512BW(src_raw, dst_argb, (const uint32_t*)&kShuffleMaskRAWToARGB, width); +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { + RGBToARGBRow_AVX512BW(src_raw, dst_argb, + (const uint32_t*)&kShuffleMaskRAWToARGB, width); } -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, (const uint32_t*)&kShuffleMaskRGB24ToARGB[0], width); } @@ -622,35 +635,35 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #endif void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -660,35 +673,35 @@ void ARGBToRGB24Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { - asm volatile("movdqa %3,%%xmm6 \n" + asm volatile("movdqa %3,%%xmm6 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "movdqu 0x20(%0),%%xmm2 \n" - "movdqu 0x30(%0),%%xmm3 \n" - "lea 0x40(%0),%0 \n" - "pshufb %%xmm6,%%xmm0 \n" - "pshufb %%xmm6,%%xmm1 \n" - "pshufb %%xmm6,%%xmm2 \n" - "pshufb %%xmm6,%%xmm3 \n" - "movdqa %%xmm1,%%xmm4 \n" - "psrldq $0x4,%%xmm1 \n" - "pslldq $0xc,%%xmm4 \n" - "movdqa %%xmm2,%%xmm5 \n" - "por %%xmm4,%%xmm0 \n" - "pslldq $0x8,%%xmm5 \n" - "movdqu %%xmm0,(%1) \n" - "por %%xmm5,%%xmm1 \n" - "psrldq $0x8,%%xmm2 \n" - "pslldq $0x4,%%xmm3 \n" - "por %%xmm3,%%xmm2 \n" - "movdqu %%xmm1,0x10(%1) \n" - "movdqu %%xmm2,0x20(%1) \n" - "lea 0x30(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "pshufb %%xmm6,%%xmm0 \n" + "pshufb %%xmm6,%%xmm1 \n" + "pshufb %%xmm6,%%xmm2 \n" + "pshufb %%xmm6,%%xmm3 \n" + "movdqa %%xmm1,%%xmm4 \n" + "psrldq $0x4,%%xmm1 \n" + "pslldq $0xc,%%xmm4 \n" + "movdqa %%xmm2,%%xmm5 \n" + "por %%xmm4,%%xmm0 \n" + "pslldq $0x8,%%xmm5 \n" + "movdqu %%xmm0,(%1) \n" + "por %%xmm5,%%xmm1 \n" + "psrldq $0x8,%%xmm2 \n" + "pslldq $0x4,%%xmm3 \n" + "por %%xmm3,%%xmm2 \n" + "movdqu %%xmm1,0x10(%1) \n" + "movdqu %%xmm2,0x20(%1) \n" + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 @@ -1153,21 +1166,21 @@ void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("movdqa %3,%%xmm2 \n" + asm volatile("movdqa %3,%%xmm2 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "psrlw $8,%%xmm0 \n" - "psrlw $8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "pshufb %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x20(%0),%0 \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1258,21 +1271,21 @@ void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { - asm volatile("vbroadcasti128 %3,%%ymm2 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "vpsrlw $8,%%ymm0,%%ymm0 \n" - "vpsrlw $8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x40(%0),%0 \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" + asm volatile("vbroadcasti128 %3,%%ymm2 \n" LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_ab64), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -1452,9 +1465,7 @@ void ARGBToYMatrixRow_SSSE3(const uint8_t* src_argb, "movdqa %%xmm4,%%xmm6 \n" "pmaddubsw %%xmm5,%%xmm6 \n" "phaddw %%xmm6,%%xmm6 \n" - "psubw %%xmm6,%%xmm7 \n" - LABELALIGN "" - RGBTOY(xmm7) + "psubw %%xmm6,%%xmm7 \n" LABELALIGN "" RGBTOY(xmm7) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1478,10 +1489,8 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, "vpmaddubsw %%ymm5,%%ymm4,%%ymm6 \n" "vphaddw %%ymm6,%%ymm6,%%ymm6 \n" "vpsubw %%ymm6,%%ymm7,%%ymm7 \n" - "vmovdqa %4,%%ymm6 \n" - LABELALIGN "" - RGBTOY_AVX2(ymm7) - "vzeroupper \n" + "vmovdqa %4,%%ymm6 \n" LABELALIGN + "" RGBTOY_AVX2(ymm7) "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1492,8 +1501,9 @@ void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, } #endif -#if defined(HAS_ARGBTOYROW_AVX512BW) || defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW) -static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, +#if defined(HAS_ARGBTOYROW_AVX512BW) || \ + defined(HAS_ARGBTOUV444ROW_AVX512BW) || defined(HAS_ARGBTOUVROW_AVX512BW) +static const uint32_t kPermdARGBToY_AVX512BW[16] = {0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}; #endif @@ -1511,15 +1521,14 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" "vpsllw $15,%%zmm16,%%zmm5 \n" "vpacksswb %%zmm5,%%zmm5,%%zmm5 \n" - "vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1 + "vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1 "vbroadcasti64x4 0(%3),%%zmm4 \n" "vbroadcasti64x4 0x60(%3),%%zmm7 \n" "vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n" "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" "vpackssdw %%zmm6,%%zmm6,%%zmm6 \n" "vpsubw %%zmm6,%%zmm7,%%zmm7 \n" - "vmovups %4,%%zmm6 \n" - LABELALIGN + "vmovups %4,%%zmm6 \n" LABELALIGN "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" @@ -1551,11 +1560,11 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, "sub $0x40,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c), // %3 - "m"(kPermdARGBToY_AVX512BW) // %4 + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c), // %3 + "m"(kPermdARGBToY_AVX512BW) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7", "xmm16"); } @@ -1713,8 +1722,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, asm volatile( "vbroadcasti64x4 0x20(%4),%%zmm3 \n" // kRGBToU "vbroadcasti64x4 0x40(%4),%%zmm4 \n" // kRGBToV - "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1 - "vpsllw $15,%%zmm16,%%zmm5 \n" // 0x8000 + "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1 + "vpsllw $15,%%zmm16,%%zmm5 \n" // 0x8000 "vmovups %5,%%zmm7 \n" "sub %1,%2 \n" @@ -1874,8 +1883,8 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, int width, const struct ArgbConstants* c) { asm volatile( - "vbroadcasti128 0x20(%5),%%ymm4 \n" // RGBToU - "vbroadcasti128 0x40(%5),%%ymm5 \n" // RGBToV + "vbroadcasti128 0x20(%5),%%ymm4 \n" // RGBToU + "vbroadcasti128 0x40(%5),%%ymm5 \n" // RGBToV "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101 "vpabsb %%ymm6,%%ymm6 \n" "vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB @@ -2174,8 +2183,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, "vbroadcasti64x4 0x20(%5),%%zmm4 \n" // RGBToU "vbroadcasti64x4 0x40(%5),%%zmm5 \n" // RGBToV "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" - "vpabsb %%zmm16,%%zmm6 \n" // 0x0101 - "vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000 + "vpabsb %%zmm16,%%zmm6 \n" // 0x0101 + "vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000 "vbroadcasti64x4 %6,%%zmm7 \n" // kShuffleAARRGGBB "vmovups %7,%%zmm18 \n" // kPermdARGBToY_AVX512BW "vmovups %8,%%zmm19 \n" // kPermdARGBToUV_AVX512BW @@ -2209,7 +2218,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, "vpmaddubsw %%zmm5,%%zmm0,%%zmm0 \n" // 16 V "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" - "vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V in upper) + "vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V + // in upper) "vpaddw %%zmm17,%%zmm0,%%zmm0 \n" "vpsrlw $0x8,%%zmm0,%%zmm0 \n" "vpackuswb %%zmm0,%%zmm0,%%zmm0 \n" // mutates @@ -2659,12 +2669,12 @@ void OMITFP I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA444 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -2985,12 +2995,12 @@ void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA210 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -3017,12 +3027,12 @@ void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA410 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -3083,12 +3093,12 @@ void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READYUVA422 YUVTORGB(yuvconstants) STOREARGB - "subl $0x8,%[width] \n" - "jg 1b \n" + "subl $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -3111,12 +3121,12 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READNV12 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3132,12 +3142,12 @@ void OMITFP NV21ToARGBRow_SSSE3(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READNV21 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [vu_buf] "+r"(vu_buf), // %[vu_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3155,7 +3165,7 @@ void OMITFP YUY2ToARGBRow_SSSE3(const uint8_t* yuy2_buf, asm volatile( "movdqa %[kShuffleYUY2Y],%%xmm6 \n" "movdqa %[kShuffleYUY2UV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READYUY2 YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" @@ -3176,7 +3186,7 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, asm volatile( "movdqa %[kShuffleUYVYY],%%xmm6 \n" "movdqa %[kShuffleUYVYUV],%%xmm7 \n" YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" LABELALIGN "1: \n" READUYVY YUVTORGB(yuvconstants) STOREARGB "sub $0x8,%[width] \n" @@ -3196,12 +3206,12 @@ void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READP210 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -3217,12 +3227,12 @@ void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP( - yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" + yuvconstants) "pcmpeqb %%xmm5,%%xmm5 \n" - LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) + LABELALIGN "1: \n" READP410 YUVTORGB(yuvconstants) STOREARGB - "sub $0x8,%[width] \n" - "jg 1b \n" + "sub $0x8,%[width] \n" + "jg 1b \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[u_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4045,13 +4055,13 @@ void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA210_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4080,13 +4090,13 @@ void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA410_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] @@ -4155,13 +4165,13 @@ void OMITFP I444AlphaToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA444_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4189,13 +4199,13 @@ void OMITFP I422AlphaToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "sub %[u_buf],%[v_buf] \n" + yuvconstants) "sub %[u_buf],%[v_buf] \n" - LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READYUVA422_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "subl $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [u_buf] "+r"(u_buf), // %[u_buf] [v_buf] "+r"(v_buf), // %[v_buf] @@ -4265,13 +4275,13 @@ void OMITFP NV12ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READNV12_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4291,13 +4301,13 @@ void OMITFP NV21ToARGBRow_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READNV21_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [vu_buf] "+r"(vu_buf), // %[vu_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4319,7 +4329,7 @@ void OMITFP YUY2ToARGBRow_AVX2(const uint8_t* yuy2_buf, asm volatile( "vbroadcasti128 %[kShuffleYUY2Y],%%ymm6 \n" "vbroadcasti128 %[kShuffleYUY2UV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READYUY2_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -4346,7 +4356,7 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, asm volatile( "vbroadcasti128 %[kShuffleUYVYY],%%ymm6 \n" "vbroadcasti128 %[kShuffleUYVYUV],%%ymm7 \n" YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN "1: \n" READUYVY_AVX2 YUVTORGB_AVX2(yuvconstants) STOREARGB_AVX2 @@ -4372,13 +4382,13 @@ void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READP210_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4398,13 +4408,13 @@ void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, const struct YuvConstants* yuvconstants, int width) { asm volatile(YUVTORGB_SETUP_AVX2( - yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + yuvconstants) "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2( + LABELALIGN "1: \n" READP410_AVX2 YUVTORGB_AVX2( yuvconstants) STOREARGB_AVX2 - "sub $0x10,%[width] \n" - "jg 1b \n" - "vzeroupper \n" + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" : [y_buf] "+r"(y_buf), // %[y_buf] [uv_buf] "+r"(uv_buf), // %[uv_buf] [dst_argb] "+r"(dst_argb), // %[dst_argb] @@ -4583,16 +4593,16 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,1),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu -0x10(%0,%2,1),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4601,21 +4611,44 @@ void MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_MIRRORROW_SSSE3 +#ifdef HAS_MIRRORROW_AVX512BW +void MirrorRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { + ptrdiff_t temp_width = (ptrdiff_t)(width); + asm volatile("vbroadcasti32x4 %3,%%zmm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu8 -0x40(%0,%2,1),%%zmm0 \n" + "vpshufb %%zmm5,%%zmm0,%%zmm0 \n" + "vshufi64x2 $0x1b,%%zmm0,%%zmm0,%%zmm0 \n" + "vmovdqu8 %%zmm0,(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirror) // %3 + : "memory", "cc", "zmm0", "zmm5"); +} +#endif // HAS_MIRRORROW_AVX512BW + #ifdef HAS_MIRRORROW_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("vbroadcasti128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu -0x20(%0,%2,1),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4624,11 +4657,50 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 -#ifdef HAS_MIRRORSPLITUVROW_AVX2 +#if defined(HAS_MIRRORSPLITUVROW_AVX2) || defined(HAS_MIRRORSPLITUVROW_AVX512BW) // Shuffle table for reversing the bytes of UV channels. static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; +#endif +#ifdef HAS_MIRRORSPLITUVROW_AVX512BW +static const uint64_t kMirrorSplitUVPermute[8] = {6, 4, 2, 0, 7, 5, 3, 1}; + +void MirrorSplitUVRow_AVX512BW(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ptrdiff_t temp_width = (ptrdiff_t)(width); + asm volatile( + "vbroadcasti32x4 %4,%%zmm1 \n" + "lea -0x40(%0,%3,2),%0 \n" + "sub %1,%2 \n" + "vmovdqu64 %5,%%zmm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu8 (%0),%%zmm0 \n" + "lea -0x40(%0),%0 \n" + "vpshufb %%zmm1,%%zmm0,%%zmm0 \n" + "vpermq %%zmm0,%%zmm3,%%zmm0 \n" + "vextracti64x4 $0x1,%%zmm0,%%ymm2 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm2,0x00(%1,%2,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV), // %4 + "m"(kMirrorSplitUVPermute) // %5 + : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3"); +} +#endif // HAS_MIRRORSPLITUVROW_AVX512BW + +#ifdef HAS_MIRRORSPLITUVROW_AVX2 void MirrorSplitUVRow_AVX2(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v, @@ -4668,16 +4740,16 @@ static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("movdqa %3,%%xmm5 \n" + asm volatile("movdqa %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu -0x10(%0,%2,2),%%xmm0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu -0x10(%0,%2,2),%%xmm0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(temp_width) // %2 @@ -4689,18 +4761,18 @@ void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) { #ifdef HAS_MIRRORUVROW_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("vbroadcasti128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpermq $0x4e,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu -0x20(%0,%2,2),%%ymm0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpermq $0x4e,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_uv), // %1 "+r"(temp_width) // %2 @@ -4759,13 +4831,11 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, #ifdef HAS_RGB24MIRRORROW_AVX2 // Shuffle first 10 pixels to last 10 mirrored. first byte zero static const uvec8 kShuffleMirrorRGB0_AVX = { - 128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u -}; + 128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; // Shuffle last 2 pixels to first 2 mirrored. last byte zero static const uvec8 kShuffleMirrorRGB1_AVX = { - 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u -}; + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_rgb24, @@ -4801,9 +4871,9 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, "sub $0x20,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 : "m"(kShuffleMirrorRGB0_AVX), // %3 "m"(kShuffleMirrorRGB1_AVX) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); @@ -4814,17 +4884,17 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("lea -0x10(%0,%2,4),%0 \n" + asm volatile("lea -0x10(%0,%2,4),%0 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufd $0x1b,%%xmm0,%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufd $0x1b,%%xmm0,%%xmm0 \n" + "lea -0x10(%0),%0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4838,16 +4908,16 @@ void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile("vmovdqu %3,%%ymm5 \n" + asm volatile("vmovdqu %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" - "vmovdqu %%ymm0,(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpermd -0x20(%0,%2,4),%%ymm5,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(temp_width) // %2 @@ -4894,6 +4964,47 @@ void SplitUVRow_AVX2(const uint8_t* src_uv, } #endif // HAS_SPLITUVROW_AVX2 +#ifdef HAS_SPLITUVROW_AVX512BW +static const uint64_t kSplitUVPermute[8] = {0, 2, 4, 6, 1, 3, 5, 7}; + +void SplitUVRow_AVX512BW(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + asm volatile( + "vpternlogd $0xff,%%zmm5,%%zmm5,%%zmm5 \n" + "vpsrlw $0x8,%%zmm5,%%zmm5 \n" + "vmovdqu64 %4,%%zmm4 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu8 (%0),%%zmm0 \n" + "vmovdqu8 0x40(%0),%%zmm1 \n" + "lea 0x80(%0),%0 \n" + "vpsrlw $0x8,%%zmm0,%%zmm2 \n" + "vpsrlw $0x8,%%zmm1,%%zmm3 \n" + "vpandd %%zmm5,%%zmm0,%%zmm0 \n" + "vpandd %%zmm5,%%zmm1,%%zmm1 \n" + "vpackuswb %%zmm1,%%zmm0,%%zmm0 \n" + "vpackuswb %%zmm3,%%zmm2,%%zmm2 \n" + "vpermq %%zmm0,%%zmm4,%%zmm0 \n" + "vpermq %%zmm2,%%zmm4,%%zmm2 \n" + "vmovdqu8 %%zmm0,(%1) \n" + "vmovdqu8 %%zmm2,0x00(%1,%2,1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x40,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "m"(kSplitUVPermute) // %4 + : "memory", "cc", "zmm0", "zmm1", "zmm2", "zmm3", "zmm4", "zmm5"); +} +#endif // HAS_SPLITUVROW_AVX512BW + #ifdef HAS_SPLITUVROW_SSE2 void SplitUVRow_SSE2(const uint8_t* src_uv, uint8_t* dst_u, @@ -5071,20 +5182,20 @@ void MergeUVRow_AVX512BW(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%0),%%zmm0 \n" - "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" - "lea 0x20(%0),%0 \n" - "vpsllw $0x8,%%zmm1,%%zmm1 \n" - "vporq %%zmm0,%%zmm1,%%zmm2 \n" - "vmovdqu64 %%zmm2,(%2) \n" - "lea 0x40(%2),%2 \n" - "sub $0x20,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%0),%%zmm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n" + "lea 0x20(%0),%0 \n" + "vpsllw $0x8,%%zmm1,%%zmm1 \n" + "vporq %%zmm0,%%zmm1,%%zmm2 \n" + "vmovdqu64 %%zmm2,(%2) \n" + "lea 0x40(%2),%2 \n" + "sub $0x20,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5099,20 +5210,20 @@ void MergeUVRow_AVX2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%0),%%ymm0 \n" - "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" - "lea 0x10(%0),%0 \n" - "vpsllw $0x8,%%ymm1,%%ymm1 \n" - "vpor %%ymm0,%%ymm1,%%ymm2 \n" - "vmovdqu %%ymm2,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%0),%%ymm0 \n" + "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n" + "lea 0x10(%0),%0 \n" + "vpsllw $0x8,%%ymm1,%%ymm1 \n" + "vpor %%ymm0,%%ymm1,%%ymm2 \n" + "vmovdqu %%ymm2,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5127,21 +5238,21 @@ void MergeUVRow_SSE2(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) { - asm volatile("sub %0,%1 \n" + asm volatile("sub %0,%1 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x00(%0,%1,1),%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm2 \n" - "punpcklbw %%xmm1,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm2 \n" - "movdqu %%xmm0,(%2) \n" - "movdqu %%xmm2,0x10(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x10,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x00(%0,%1,1),%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm2 \n" + "punpcklbw %%xmm1,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm2,0x10(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x10,%3 \n" + "jg 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 @@ -5376,24 +5487,24 @@ void Convert16To8Row_AVX512BW(const uint16_t* src_y, uint8_t* dst_y, int scale, int width) { - asm volatile("vpbroadcastw %3,%%zmm2 \n" + asm volatile("vpbroadcastw %3,%%zmm2 \n" // 64 pixels per loop. LABELALIGN - "1: \n" - "vmovups (%0),%%zmm0 \n" - "vmovups 0x40(%0),%%zmm1 \n" - "add $0x80,%0 \n" - "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n" - "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n" - "vpmovuswb %%zmm0,%%ymm0 \n" - "vpmovuswb %%zmm1,%%ymm1 \n" - "vmovups %%ymm0,(%1) \n" - "vmovups %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x40,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovups (%0),%%zmm0 \n" + "vmovups 0x40(%0),%%zmm1 \n" + "add $0x80,%0 \n" + "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n" + "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n" + "vpmovuswb %%zmm0,%%ymm0 \n" + "vpmovuswb %%zmm1,%%ymm1 \n" + "vmovups %%ymm0,(%1) \n" + "vmovups %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x40,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -5443,24 +5554,24 @@ void Convert8To16Row_AVX2(const uint8_t* src_y, int scale, int width) { const int shift = __builtin_clz(scale) - 15; - asm volatile("vmovd %3,%%xmm2 \n" + asm volatile("vmovd %3,%%xmm2 \n" // 32 pixels per loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vpermq $0xd8,%%ymm0,%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" - "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n" - "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "add $0x40,%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm2,%%ymm0,%%ymm0 \n" + "vpsrlw %%xmm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "add $0x40,%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -6241,7 +6352,7 @@ void MergeXR30Row_AVX2(const uint16_t* src_r, #if defined(__i386__) : "m"(shift) // %5 #else - : "rm"(shift) // %5 + : "rm"(shift) // %5 #endif : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -6577,7 +6688,7 @@ void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width) { // Multiple of 1. void CopyRow_ERMS(const uint8_t* src, uint8_t* dst, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep movsb \n" + asm volatile("rep movsb \n" : "+S"(src), // %0 "+D"(dst), // %1 "+c"(width_tmp) // %2 @@ -6787,7 +6898,7 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width >> 2); const uint32_t v32 = v8 * 0x01010101u; // Duplicate byte to all bytes. - asm volatile("rep stosl \n" + asm volatile("rep stosl \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -6796,7 +6907,7 @@ void SetRow_X86(uint8_t* dst, uint8_t v8, int width) { void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosb \n" + asm volatile("rep stosb \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 : "a"(v8) // %2 @@ -6805,7 +6916,7 @@ void SetRow_ERMS(uint8_t* dst, uint8_t v8, int width) { void ARGBSetRow_X86(uint8_t* dst_argb, uint32_t v32, int width) { size_t width_tmp = (size_t)(width); - asm volatile("rep stosl \n" + asm volatile("rep stosl \n" : "+D"(dst_argb), // %0 "+c"(width_tmp) // %1 : "a"(v32) // %2 @@ -7966,28 +8077,28 @@ void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqu (%1),%%xmm2 \n" - "lea 0x10(%1),%1 \n" - "movdqu %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,%%xmm3 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm3,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "movdqu %%xmm0,(%2) \n" - "lea 0x10(%2),%2 \n" - "sub $0x4,%3 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqu (%1),%%xmm2 \n" + "lea 0x10(%1),%1 \n" + "movdqu %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,%%xmm3 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm3,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%2) \n" + "lea 0x10(%2),%2 \n" + "sub $0x4,%3 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -8003,27 +8114,27 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, const uint8_t* src_argb1, uint8_t* dst_argb, int width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "lea 0x20(%0),%0 \n" - "vmovdqu (%1),%%ymm3 \n" - "lea 0x20(%1),%1 \n" - "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" - "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,(%2) \n" - "lea 0x20(%2),%2 \n" - "sub $0x8,%3 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm1 \n" + "lea 0x20(%0),%0 \n" + "vmovdqu (%1),%%ymm3 \n" + "lea 0x20(%1),%1 \n" + "vpunpcklbw %%ymm1,%%ymm1,%%ymm0 \n" + "vpunpckhbw %%ymm1,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%2) \n" + "lea 0x20(%2),%2 \n" + "sub $0x8,%3 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(src_argb1), // %1 "+r"(dst_argb), // %2 @@ -8783,10 +8894,14 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr, "vmovd %3,%%xmm5 \n" "vpunpcklwd %%xmm0,%%xmm5,%%xmm5 \n" "vpbroadcastd %%xmm5,%%ymm5 \n" - "mov $0x80008000,%%eax \n" // 0x80008000 used to bias unsigned words to signed range for vpmaddwd. + "mov $0x80008000,%%eax \n" // 0x80008000 used to bias + // unsigned words to + // signed range for + // vpmaddwd. "vmovd %%eax,%%xmm4 \n" "vbroadcastss %%xmm4,%%ymm4 \n" - "mov $8388736,%%eax \n" // 32768 * 256 + 128 rounding constant. + "mov $8388736,%%eax \n" // 32768 * 256 + 128 + // rounding constant. "vmovd %%eax,%%xmm3 \n" "vbroadcastss %%xmm3,%%ymm3 \n" @@ -8811,8 +8926,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr, "jg 1b \n" "jmp 99f \n" - "50: \n" - LABELALIGN + "50: \n" LABELALIGN "2: \n" "vmovdqu (%1),%%ymm0 \n" "vpavgw (%1,%4,2),%%ymm0,%%ymm0 \n" @@ -8822,8 +8936,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr, "jg 2b \n" "jmp 99f \n" - "100: \n" - LABELALIGN + "100: \n" LABELALIGN "3: \n" "vmovdqu (%1),%%ymm0 \n" "vmovdqu %%ymm0,0x00(%1,%0,1) \n" @@ -8832,7 +8945,7 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr, "jg 3b \n" "99: \n" - "vzeroupper \n" + "vzeroupper \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(width), // %2 @@ -8848,20 +8961,20 @@ void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("movdqu (%3),%%xmm5 \n" + asm volatile("movdqu (%3),%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x8,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -8876,21 +8989,21 @@ void ARGBShuffleRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile("vbroadcasti128 (%3),%%ymm5 \n" + asm volatile("vbroadcasti128 (%3),%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -8905,27 +9018,26 @@ void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, uint8_t* dst_argb, const uint8_t* shuffler, int width) { - asm volatile( - "vbroadcasti32x4 (%3),%%zmm5 \n" + asm volatile("vbroadcasti32x4 (%3),%%zmm5 \n" - LABELALIGN - "1: \n" - "vmovdqu8 (%0),%%zmm0 \n" - "vmovdqu8 0x40(%0),%%zmm1 \n" - "lea 0x80(%0),%0 \n" - "vpshufb %%zmm5,%%zmm0,%%zmm0 \n" - "vpshufb %%zmm5,%%zmm1,%%zmm1 \n" - "vmovdqu8 %%zmm0,(%1) \n" - "vmovdqu8 %%zmm1,0x40(%1) \n" - "lea 0x80(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(shuffler) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm5"); + LABELALIGN + "1: \n" + "vmovdqu8 (%0),%%zmm0 \n" + "vmovdqu8 0x40(%0),%%zmm1 \n" + "lea 0x80(%0),%0 \n" + "vpshufb %%zmm5,%%zmm0,%%zmm0 \n" + "vpshufb %%zmm5,%%zmm1,%%zmm1 \n" + "vmovdqu8 %%zmm0,(%1) \n" + "vmovdqu8 %%zmm1,0x40(%1) \n" + "lea 0x80(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(shuffler) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); } #endif // HAS_ARGBSHUFFLEROW_AVX512BW @@ -8935,24 +9047,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "add $0x10,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "add $0x10,%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -8969,24 +9081,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "movq (%1),%%xmm2 \n" - "movq 0x00(%1,%2,1),%%xmm1 \n" - "add $0x8,%1 \n" - "punpcklbw %%xmm1,%%xmm2 \n" - "movdqu (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "add $0x10,%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqu %%xmm1,(%3) \n" - "movdqu %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq 0x00(%1,%2,1),%%xmm1 \n" + "add $0x8,%1 \n" + "punpcklbw %%xmm1,%%xmm2 \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "add $0x10,%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqu %%xmm1,(%3) \n" + "movdqu %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -9003,27 +9115,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_yuy2, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" - "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n" + "vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -9040,27 +9152,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y, const uint8_t* src_v, uint8_t* dst_uyvy, int width) { - asm volatile("sub %1,%2 \n" + asm volatile("sub %1,%2 \n" LABELALIGN - "1: \n" - "vpmovzxbw (%1),%%ymm1 \n" - "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" - "add $0x10,%1 \n" - "vpsllw $0x8,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu (%0),%%ymm0 \n" - "add $0x20,%0 \n" - "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" - "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" - "vextractf128 $0x0,%%ymm1,(%3) \n" - "vextractf128 $0x0,%%ymm2,0x10(%3) \n" - "vextractf128 $0x1,%%ymm1,0x20(%3) \n" - "vextractf128 $0x1,%%ymm2,0x30(%3) \n" - "lea 0x40(%3),%3 \n" - "sub $0x20,%4 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vpmovzxbw (%1),%%ymm1 \n" + "vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n" + "add $0x10,%1 \n" + "vpsllw $0x8,%%ymm2,%%ymm2 \n" + "vpor %%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu (%0),%%ymm0 \n" + "add $0x20,%0 \n" + "vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n" + "vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n" + "vextractf128 $0x0,%%ymm1,(%3) \n" + "vextractf128 $0x0,%%ymm2,0x10(%3) \n" + "vextractf128 $0x1,%%ymm1,0x20(%3) \n" + "vextractf128 $0x1,%%ymm2,0x30(%3) \n" + "lea 0x40(%3),%3 \n" + "sub $0x20,%4 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -9076,47 +9188,47 @@ void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, uint8_t* dst_argb, const float* poly, int width) { - asm volatile("pxor %%xmm3,%%xmm3 \n" + asm volatile("pxor %%xmm3,%%xmm3 \n" // 2 pixel loop. LABELALIGN - "1: \n" - "movq (%0),%%xmm0 \n" - "lea 0x8(%0),%0 \n" - "punpcklbw %%xmm3,%%xmm0 \n" - "movdqa %%xmm0,%%xmm4 \n" - "punpcklwd %%xmm3,%%xmm0 \n" - "punpckhwd %%xmm3,%%xmm4 \n" - "cvtdq2ps %%xmm0,%%xmm0 \n" - "cvtdq2ps %%xmm4,%%xmm4 \n" - "movdqa %%xmm0,%%xmm1 \n" - "movdqa %%xmm4,%%xmm5 \n" - "mulps 0x10(%3),%%xmm0 \n" - "mulps 0x10(%3),%%xmm4 \n" - "addps (%3),%%xmm0 \n" - "addps (%3),%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm5,%%xmm6 \n" - "mulps %%xmm1,%%xmm2 \n" - "mulps %%xmm5,%%xmm6 \n" - "mulps %%xmm2,%%xmm1 \n" - "mulps %%xmm6,%%xmm5 \n" - "mulps 0x20(%3),%%xmm2 \n" - "mulps 0x20(%3),%%xmm6 \n" - "mulps 0x30(%3),%%xmm1 \n" - "mulps 0x30(%3),%%xmm5 \n" - "addps %%xmm2,%%xmm0 \n" - "addps %%xmm6,%%xmm4 \n" - "addps %%xmm1,%%xmm0 \n" - "addps %%xmm5,%%xmm4 \n" - "cvttps2dq %%xmm0,%%xmm0 \n" - "cvttps2dq %%xmm4,%%xmm4 \n" - "packuswb %%xmm4,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movq %%xmm0,(%1) \n" - "lea 0x8(%1),%1 \n" - "sub $0x2,%2 \n" - "jg 1b \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x2,%2 \n" + "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 @@ -9204,7 +9316,7 @@ void HalfFloatRow_AVX2(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4", "xmm5"); } @@ -9242,7 +9354,7 @@ void HalfFloatRow_F16C(const uint16_t* src, #if defined(__x86_64__) : "x"(scale) // %3 #else - : "m"(scale) // %3 + : "m"(scale) // %3 #endif : "memory", "cc", "xmm2", "xmm3", "xmm4"); } @@ -9576,20 +9688,20 @@ static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, // Convert UV plane of NV12 to VU of NV21. void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("movdqu %3,%%xmm5 \n" + asm volatile("movdqu %3,%%xmm5 \n" LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "movdqu 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pshufb %%xmm5,%%xmm0 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 @@ -9600,21 +9712,21 @@ void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width) { #ifdef HAS_SWAPUVROW_AVX2 void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) { - asm volatile("vbroadcasti128 %3,%%ymm5 \n" + asm volatile("vbroadcasti128 %3,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm0 \n" - "vmovdqu 0x20(%0),%%ymm1 \n" - "lea 0x40(%0),%0 \n" - "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" - "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_uv), // %0 "+r"(dst_vu), // %1 "+r"(width) // %2 diff --git a/source/row_lasx.cc b/source/row_lasx.cc index 94cb44ed1..e0802c15e 100644 --- a/source/row_lasx.cc +++ b/source/row_lasx.cc @@ -2027,10 +2027,12 @@ struct ArgbConstants { // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; + 128, + 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, + 128, + 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2039,19 +2041,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // Add 16.5 = 0x1080 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; + 0x1080, + 0}; static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; + 0x1080, + 0}; #endif // ArgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_LASX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7}; asm volatile( "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants @@ -2216,18 +2218,14 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba, "xvst $xr10, %1, 0 \n\t" "addi.d %1, %1, 32 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(c), // %3 - "r"(shuff) // %4 + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(c), // %3 + "r"(shuff) // %4 : "memory"); } - - - - void ARGBToUVJRow_LASX(const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u, diff --git a/source/row_lsx.cc b/source/row_lsx.cc index 41689578a..3e6d5154c 100644 --- a/source/row_lsx.cc +++ b/source/row_lsx.cc @@ -2812,10 +2812,12 @@ struct ArgbConstants { // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 128, - 0}; + 128, + 0}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0}; +static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, + 128, + 0}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -2824,19 +2826,19 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128, 0} // Add 16.5 = 0x1080 static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080, - 0}; + 0x1080, + 0}; static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, - 0x1080, - 0}; + 0x1080, + 0}; #endif // ArgbConstants // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_LSX(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants @@ -2987,18 +2989,14 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba, "vst $vr10, %1, 0 \n\t" "addi.d %1, %1, 16 \n\t" "bnez %2, 1b \n\t" - : "+&r"(src_rgba), // %0 - "+&r"(dst_y), // %1 - "+&r"(width) // %2 - : "r"(c), // %3 - "r"(shuff) // %4 + : "+&r"(src_rgba), // %0 + "+&r"(dst_y), // %1 + "+&r"(width) // %2 + : "r"(c), // %3 + "r"(shuff) // %4 : "memory"); } - - - - // undef for unified sources build #undef YUVTORGB_SETUP #undef READYUV422_D diff --git a/source/row_neon.cc b/source/row_neon.cc index 62644a321..08608005f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/convert_from_argb.h" // For ArgbConstants +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -272,7 +272,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, "subs %[width], %[width], #8 \n" // YUVTORGB // RGBTORGB8 // - STORERGBA // + STORERGBA // "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -325,9 +325,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "vmov.u8 d6, #255 \n" "1: \n" // - READYUV422 - "subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 - ARGBTORGB565 + READYUV422 "subs %[width], %[width], #8 \n" YUVTORGB + RGBTORGB8 ARGBTORGB565 "vst1.8 {q2}, [%[dst_rgb565]]! \n" // store 8 pixels RGB565. "bgt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -1887,13 +1886,13 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(&c->kRGBToU), // %4 - "r"(&c->kRGBToV), // %5 - "r"(&c->kAddUV) // %6 + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&c->kRGBToU), // %4 + "r"(&c->kRGBToV), // %5 + "r"(&c->kAddUV) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q10", "q11", "q12"); } @@ -1912,7 +1911,6 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants); } - // clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ @@ -1934,8 +1932,9 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, int width, const struct ArgbConstants* c) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - "vld1.8 {d24}, [%5] \n" // load kRGBToU (8 bytes, only 4 used) + asm volatile( + "vld1.8 {d24}, [%5] \n" // load kRGBToU (8 bytes, + // only 4 used) "vld1.8 {d25}, [%6] \n" // load kRGBToV "vmovl.s8 q14, d24 \n" // U coeffs in d28 "vmovl.s8 q15, d25 \n" // V coeffs in d30 @@ -1943,7 +1942,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. - "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB + // pixels. "subs %4, %4, #16 \n" // 16 processed per loop. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. @@ -1985,16 +1985,15 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : "r"(&c->kRGBToU), // %5 - "r"(&c->kRGBToV) // %6 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q11", "q12", "q14", "q15" - ); + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : "r"(&c->kRGBToU), // %5 + "r"(&c->kRGBToV) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", + "q9", "q11", "q12", "q14", "q15"); } void ARGBToUVRow_NEON(const uint8_t* src_argb, @@ -2704,9 +2703,9 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "vld1.8 {d24}, [%3] \n" // load kRGBToY "vld1.16 {d25[0]}, [%4] \n" // load kAddY[0] @@ -2773,9 +2772,9 @@ void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) { } void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "vld1.8 {d24}, [%3] \n" // load kRGBToY "vld1.16 {d25[0]}, [%4] \n" // load kAddY[0] @@ -2807,10 +2806,6 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "d24", "d25"); } - - - - // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 65d7b65a5..f90b4a18b 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -8,8 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/convert_from_argb.h" +#include "libyuv/row.h" #ifdef __cplusplus namespace libyuv { @@ -292,12 +292,12 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" // + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" // READYUV210 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -321,12 +321,12 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "dup v23.8h, %w[alpha] \n" - "1: \n" // + "dup v22.8h, %w[limit] \n" + "dup v23.8h, %w[alpha] \n" + "1: \n" // READYUV410 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -349,12 +349,12 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "1: \n" // + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "1: \n" // READYUV212 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] [src_v] "+r"(src_v), // %[src_v] @@ -531,13 +531,13 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP210 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -558,13 +558,13 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; asm volatile(YUVTORGB_SETUP - "dup v22.8h, %w[limit] \n" - "movi v23.8h, #0xc0, lsl #8 \n" // A - "ldr q2, [%[kIndices]] \n" - "1: \n" // + "dup v22.8h, %w[limit] \n" + "movi v23.8h, #0xc0, lsl #8 \n" // A + "ldr q2, [%[kIndices]] \n" + "1: \n" // READYUVP410 - "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 - "b.gt 1b \n" + "subs %w[width], %w[width], #8 \n" NVTORGB STOREAR30 + "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_uv] "+r"(src_uv), // %[src_uv] [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] @@ -783,9 +783,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "1: \n" // - READYUV422 - "subs %w[width], %w[width], #8 \n" I4XXTORGB RGBTORGB8_TOP - ARGBTORGB565_FROM_TOP + READYUV422 "subs %w[width], %w[width], #8 \n" I4XXTORGB + RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] @@ -1036,9 +1035,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "ldr q2, [%[kNV12Table]] \n" "1: \n" // - READNV12 - "subs %w[width], %w[width], #8 \n" NVTORGB RGBTORGB8_TOP - ARGBTORGB565_FROM_TOP + READNV12 "subs %w[width], %w[width], #8 \n" NVTORGB + RGBTORGB8_TOP ARGBTORGB565_FROM_TOP "st1 {v18.8h}, [%[dst_rgb565]], #16 \n" // store 8 // pixels // RGB565. @@ -2742,20 +2740,22 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, int width, const struct ArgbConstants* c) { asm volatile( - "ldr q16, [%[c], #16] \n" // kRGBToU - "ldr q17, [%[c], #32] \n" // kRGBToV - "ldr s0, [%[c], #64] \n" // kAddUV - "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit - "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit - "dup v20.8h, v16.h[0] \n" // U0 - "dup v21.8h, v16.h[1] \n" // U1 - "dup v22.8h, v16.h[2] \n" // U2 - "dup v23.8h, v16.h[3] \n" // U3 - "dup v24.8h, v17.h[0] \n" // V0 - "dup v26.8h, v17.h[1] \n" // V1 - "dup v27.8h, v17.h[2] \n" // V2 - "dup v28.8h, v17.h[3] \n" // V3 - "dup v25.8h, v0.h[0] \n" // kAddUV + "ldr q16, [%[c], #16] \n" // kRGBToU + "ldr q17, [%[c], #32] \n" // kRGBToV + "ldr s0, [%[c], #64] \n" // kAddUV + "sxtl v16.8h, v16.8b \n" // sign extend U coeffs + // to 16-bit + "sxtl v17.8h, v17.8b \n" // sign extend V coeffs + // to 16-bit + "dup v20.8h, v16.h[0] \n" // U0 + "dup v21.8h, v16.h[1] \n" // U1 + "dup v22.8h, v16.h[2] \n" // U2 + "dup v23.8h, v16.h[3] \n" // U3 + "dup v24.8h, v17.h[0] \n" // V0 + "dup v26.8h, v17.h[1] \n" // V1 + "dup v27.8h, v17.h[2] \n" // V2 + "dup v28.8h, v17.h[3] \n" // V3 + "dup v25.8h, v0.h[0] \n" // kAddUV "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "subs %w3, %w3, #8 \n" // 8 processed per loop. @@ -2783,27 +2783,26 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, "st1 {v0.8b}, [%1], #8 \n" "st1 {v1.8b}, [%2], #8 \n" "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : [c] "r"(c) // %4 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", - "v26", "v27", "v28"); + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : [c] "r"(c) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", + "v27", "v28"); } -static void ARGBToUV444MatrixRow_NEON_I8MM( - const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { +static void ARGBToUV444MatrixRow_NEON_I8MM(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { asm volatile( - "ldr q16, [%[c], #16] \n" // kRGBToU - "ldr q17, [%[c], #32] \n" // kRGBToV - "ldr s0, [%[c], #64] \n" // kAddUV - "dup v29.8h, v0.h[0] \n" // 128.0 + "ldr q16, [%[c], #16] \n" // kRGBToU + "ldr q17, [%[c], #32] \n" // kRGBToV + "ldr s0, [%[c], #64] \n" // kAddUV + "dup v29.8h, v0.h[0] \n" // 128.0 "1: \n" "ldp q0, q1, [%[src]], #32 \n" "subs %w[width], %w[width], #8 \n" // 8 processed per loop. @@ -2823,11 +2822,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM( "str d0, [%[dst_u]], #8 \n" // store 8 pixels U. "str d1, [%[dst_v]], #8 \n" // store 8 pixels V. "b.gt 1b \n" - : [src] "+r"(src_argb), // %[src] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width) // %[width] - : [c] "r"(c) // %[c] + : [src] "+r"(src_argb), // %[src] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [c] "r"(c) // %[c] : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v29"); } @@ -2844,8 +2843,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kArgbI601Constants); + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants); } void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, @@ -2860,8 +2858,7 @@ void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants); } void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, @@ -2903,23 +2900,27 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, int width, const struct ArgbConstants* c) { const uint8_t* src_argb_1 = src_argb + src_stride_argb; - asm volatile ( - "ldr q16, [%[c], #16] \n" // kRGBToU - "ldr q17, [%[c], #32] \n" // kRGBToV - "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit - "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit - "dup v20.8h, v16.h[0] \n" // U0 - "dup v21.8h, v16.h[1] \n" // U1 - "dup v22.8h, v16.h[2] \n" // U2 - "dup v23.8h, v16.h[3] \n" // U3 - "dup v24.8h, v17.h[0] \n" // V0 - "dup v26.8h, v17.h[1] \n" // V1 - "dup v27.8h, v17.h[2] \n" // V2 - "dup v28.8h, v17.h[3] \n" // V3 - "movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000) + asm volatile( + "ldr q16, [%[c], #16] \n" // kRGBToU + "ldr q17, [%[c], #32] \n" // kRGBToV + "sxtl v16.8h, v16.8b \n" // sign extend U coeffs + // to 16-bit + "sxtl v17.8h, v17.8b \n" // sign extend V coeffs + // to 16-bit + "dup v20.8h, v16.h[0] \n" // U0 + "dup v21.8h, v16.h[1] \n" // U1 + "dup v22.8h, v16.h[2] \n" // U2 + "dup v23.8h, v16.h[3] \n" // U3 + "dup v24.8h, v17.h[0] \n" // V0 + "dup v26.8h, v17.h[1] \n" // V1 + "dup v27.8h, v17.h[2] \n" // V2 + "dup v28.8h, v17.h[3] \n" // V3 + "movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit + // (0x8000) "1: \n" - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 + // pixels. "subs %w4, %w4, #16 \n" // 16 processed per loop. "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%0, 448] \n" @@ -2927,7 +2928,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v18.8h, v3.16b \n" // A 16 bytes -> 8 shorts. - "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 + // more. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. @@ -2940,34 +2942,33 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb, "urshr v18.8h, v18.8h, #2 \n" // U = B*U0 + G*U1 + R*U2 + A*U3 - "mul v3.8h, v0.8h, v20.8h \n" - "mla v3.8h, v1.8h, v21.8h \n" - "mla v3.8h, v2.8h, v22.8h \n" - "mla v3.8h, v18.8h, v23.8h \n" + "mul v3.8h, v0.8h, v20.8h \n" + "mla v3.8h, v1.8h, v21.8h \n" + "mla v3.8h, v2.8h, v22.8h \n" + "mla v3.8h, v18.8h, v23.8h \n" // V = B*V0 + G*V1 + R*V2 + A*V3 - "mul v4.8h, v0.8h, v24.8h \n" - "mla v4.8h, v1.8h, v26.8h \n" - "mla v4.8h, v2.8h, v27.8h \n" - "mla v4.8h, v18.8h, v28.8h \n" + "mul v4.8h, v0.8h, v24.8h \n" + "mla v4.8h, v1.8h, v26.8h \n" + "mla v4.8h, v2.8h, v27.8h \n" + "mla v4.8h, v18.8h, v28.8h \n" // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8 - "subhn v0.8b, v25.8h, v3.8h \n" - "subhn v1.8b, v25.8h, v4.8h \n" + "subhn v0.8b, v25.8h, v3.8h \n" + "subhn v1.8b, v25.8h, v4.8h \n" "st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. "st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(src_argb_1), // %1 - "+r"(dst_u), // %2 - "+r"(dst_v), // %3 - "+r"(width) // %4 - : [c] "r"(c) // %5 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26", - "v27", "v28" - ); + : "+r"(src_argb), // %0 + "+r"(src_argb_1), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(width) // %4 + : [c] "r"(c) // %5 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", + "v28"); } void ARGBToUVRow_NEON(const uint8_t* src_argb, @@ -3330,11 +3331,11 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout. static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, - int src_stride, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct ArgbConstants* c) { + int src_stride, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { const uint8_t* src1 = src + src_stride; asm volatile( "movi v23.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in @@ -3388,12 +3389,12 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, "str d0, [%[dst_u]], #8 \n" // store 8 pixels U "str d1, [%[dst_v]], #8 \n" // store 8 pixels V "b.gt 1b \n" - : [src] "+r"(src), // %[src] - [src1] "+r"(src1), // %[src1] - [dst_u] "+r"(dst_u), // %[dst_u] - [dst_v] "+r"(dst_v), // %[dst_v] - [width] "+r"(width) // %[width] - : [c] "r"(c) // %[c] + : [src] "+r"(src), // %[src] + [src1] "+r"(src1), // %[src1] + [dst_u] "+r"(dst_u), // %[dst_u] + [dst_v] "+r"(dst_v), // %[dst_v] + [width] "+r"(width) // %[width] + : [c] "r"(c) // %[c] : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23", "v24", "v25"); } @@ -3404,8 +3405,8 @@ void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, - c); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, + width, c); } void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, @@ -3413,8 +3414,8 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbI601Constants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, + width, &kArgbI601Constants); } void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr, @@ -3422,8 +3423,8 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrI601Constants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, + width, &kAbgrI601Constants); } void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra, @@ -3431,8 +3432,8 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width, - &kBgraI601Constants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, + width, &kBgraI601Constants); } void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba, @@ -3440,8 +3441,8 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width, - &kRgbaI601Constants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, + width, &kRgbaI601Constants); } void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb, @@ -3449,8 +3450,8 @@ void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, - &kArgbJPEGConstants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, + width, &kArgbJPEGConstants); } void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr, @@ -3458,8 +3459,8 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, - &kAbgrJPEGConstants); + ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, + width, &kAbgrJPEGConstants); } void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { @@ -3558,13 +3559,11 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"); } - - // ARGB expects first 3 values to contain RGB and 4th value is ignored. void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "ldr s16, [%3] \n" // load 4 coeffs "ldr s17, [%3, #48] \n" // load kAddY[0] @@ -3589,20 +3588,18 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "addhn v1.8b, v1.8h, v22.8h \n" "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22"); } - -void ARGBToYMatrixRow_NEON_DotProd( - const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { +void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "ldr s16, [%3] \n" // load 4 coeffs "ldr s17, [%3, #48] \n" // load kAddY[0] @@ -3625,14 +3622,14 @@ void ARGBToYMatrixRow_NEON_DotProd( "addhn v1.8b, v1.8h, v19.8h \n" "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c) // %3 - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19"); + : "+r"(src_argb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", + "v17", "v18", "v19"); } - // RGB to JPeg coefficients void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { @@ -3708,9 +3705,9 @@ void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, } void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( "ldr s16, [%3] \n" // load 4 coeffs "ldr s17, [%3, #48] \n" // load kAddY[0] @@ -3732,18 +3729,14 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, "addhn v1.8b, v1.8h, v21.8h \n" "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "b.gt 1b \n" - : "+r"(src_rgb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(c) // %3 + : "+r"(src_rgb), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(c) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18", "v19", "v20", "v21"); } - - - - // Bilinear filter 16x2 -> 16x1 void InterpolateRow_NEON(uint8_t* dst_ptr, const uint8_t* src_ptr, diff --git a/source/row_rvv.cc b/source/row_rvv.cc index 93bc431bc..91752ed16 100644 --- a/source/row_rvv.cc +++ b/source/row_rvv.cc @@ -1249,16 +1249,22 @@ void MergeUVRow_RVV(const uint8_t* src_u, } #endif - - // RGB to JPeg coefficients // B * 0.1140 coefficient = 29 // G * 0.5870 coefficient = 150 // R * 0.2990 coefficient = 77 // Add 0.5 = 0x80 -static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {0}, {0}, {128}, {0}}; +static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, + {0}, + {0}, + {128}, + {0}}; -static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0}, {128}, {0}}; +static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, + {0}, + {0}, + {128}, + {0}}; // RGB to BT.601 coefficients // B * 0.1016 coefficient = 25 @@ -1266,16 +1272,24 @@ static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {0}, {0 // R * 0.2578 coefficient = 66 // Add 16.5 = 0x1080 -static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, + {0}, + {0}, + {0x1080}, + {0}}; -static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {0}, {0}, {0x1080}, {0}}; +static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, + {0}, + {0}, + {0x1080}, + {0}}; // ARGB expects first 3 values to contain RGB and 4th value is ignored #ifdef HAS_ARGBTOYMATRIXROW_RVV void ARGBToYMatrixRow_RVV(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct ArgbConstants* c) { + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { assert(width != 0); size_t w = (size_t)width; vuint8m2_t v_by, v_gy, v_ry; // vectors are to store RGBToY constant diff --git a/source/row_sme.cc b/source/row_sme.cc index fca536dc4..2291562e2 100644 --- a/source/row_sme.cc +++ b/source/row_sme.cc @@ -1127,9 +1127,10 @@ __arm_locally_streaming void ARGBToUVMatrixRow_SME( uint8_t* dst_v, int width, const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; + int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], + (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], + (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], + (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, uvconstants); } diff --git a/source/row_sve.cc b/source/row_sve.cc index 7d8734921..662685882 100644 --- a/source/row_sve.cc +++ b/source/row_sve.cc @@ -223,9 +223,10 @@ void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb, uint8_t* dst_v, int width, const struct ArgbConstants* c) { - int8_t uvconstants[8] = { - (int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], - (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; + int8_t uvconstants[8] = {(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], + (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3], + (int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], + (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]}; ARGBToUVMatrixRow_SVE_SC(src_argb, src_stride_argb, dst_u, dst_v, width, uvconstants); } diff --git a/source/row_win.cc b/source/row_win.cc index 441fe1451..a7ed75199 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -8,19 +8,19 @@ * be found in the AUTHORS file in the root of the source tree. */ -#include "libyuv/row.h" #include "libyuv/convert_from_argb.h" // For ArgbConstants +#include "libyuv/row.h" // This module is for Visual C 32/64 bit -#if !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__) || \ - defined(_M_X64) || defined(_M_X86)) && \ - ((defined(_MSC_VER) && !defined(__clang__)) || \ +#if !defined(LIBYUV_DISABLE_X86) && \ + (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || \ + defined(_M_X86)) && \ + ((defined(_MSC_VER) && !defined(__clang__)) || \ defined(LIBYUV_ENABLE_ROWWIN)) #include -#include // For _mm_maddubs_epi16 #include // For AVX2 intrinsics +#include // For _mm_maddubs_epi16 #ifdef __cplusplus namespace libyuv { @@ -266,27 +266,33 @@ void BGRAToYRow_AVX2(const uint8_t* src_bgra, uint8_t* dst_y, int width) { LIBYUV_TARGET_AVX2 void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { __m256i ymm_alpha = _mm256_set1_epi32(0xff000000); - __m128i shuf_low = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); - __m128i shuf_high = _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6); + __m128i shuf_low = + _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); + __m128i shuf_high = + _mm_set_epi8(-1, 13, 14, 15, -1, 10, 11, 12, -1, 7, 8, 9, -1, 4, 5, 6); __m256i ymm_shuf = _mm256_broadcastsi128_si256(shuf_low); __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(shuf_high); while (width > 0) { __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_raw); __m256i ymm0 = _mm256_castsi128_si256(xmm0); - ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1); + ymm0 = _mm256_inserti128_si256( + ymm0, _mm_loadu_si128((const __m128i*)(src_raw + 12)), 1); __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_raw + 24)); __m256i ymm1 = _mm256_castsi128_si256(xmm1); - ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1); + ymm1 = _mm256_inserti128_si256( + ymm1, _mm_loadu_si128((const __m128i*)(src_raw + 36)), 1); __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_raw + 48)); __m256i ymm2 = _mm256_castsi128_si256(xmm2); - ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1); + ymm2 = _mm256_inserti128_si256( + ymm2, _mm_loadu_si128((const __m128i*)(src_raw + 60)), 1); __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_raw + 68)); __m256i ymm3 = _mm256_castsi128_si256(xmm3); - ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1); + ymm3 = _mm256_inserti128_si256( + ymm3, _mm_loadu_si128((const __m128i*)(src_raw + 80)), 1); ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); @@ -312,10 +318,13 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) { #ifdef HAS_RAWTOARGBROW_AVX512BW LIBYUV_TARGET_AVX512BW -void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m128i* shuffler, int width) { +void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, + uint8_t* dst_argb, + const __m128i* shuffler, + int width) { __m512i zmm_alpha = _mm512_set1_epi32(0xff000000); - __m512i zmm_perm = _mm512_set_epi32( - 12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0); + __m512i zmm_perm = + _mm512_set_epi32(12, 11, 10, 9, 9, 8, 7, 6, 6, 5, 4, 3, 3, 2, 1, 0); __m512i zmm_shuf = _mm512_broadcast_i32x4(_mm_loadu_si128(shuffler)); while (width > 0) { @@ -351,14 +360,20 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const __m1 } LIBYUV_TARGET_AVX512BW -void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); +void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, + uint8_t* dst_argb, + int width) { + __m128i shuf = + _mm_set_epi8(-1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2); RGBToARGBRow_AVX512BW(src_raw, dst_argb, &shuf, width); } LIBYUV_TARGET_AVX512BW -void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { - __m128i shuf = _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); +void RGB24ToARGBRow_AVX512BW(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { + __m128i shuf = + _mm_set_epi8(-1, 11, 10, 9, -1, 8, 7, 6, -1, 5, 4, 3, -1, 2, 1, 0); RGBToARGBRow_AVX512BW(src_rgb24, dst_argb, &shuf, width); } #endif @@ -374,16 +389,19 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb, __m256i ymm_u = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU)); __m256i ymm_v = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV)); __m256i ymm_0101 = _mm256_set1_epi16(0x0101); - __m256i ymm_shuf = _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, - 0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); + __m256i ymm_shuf = + _mm256_setr_epi8(0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15, 0, + 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15); __m256i ymm_8000 = _mm256_set1_epi16((short)0x8000); __m256i ymm_zero = _mm256_setzero_si256(); while (width > 0) { __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb); __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32)); - __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb)); - __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32)); + __m256i ymm2 = + _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb)); + __m256i ymm3 = + _mm256_loadu_si256((const __m256i*)(src_argb + src_stride_argb + 32)); ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); @@ -455,8 +473,8 @@ void MergeUVRow_AVX2(const uint8_t* src_u, #ifdef HAS_MIRRORROW_AVX2 LIBYUV_TARGET_AVX2 void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { - __m256i ymm_shuf = - _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + __m256i ymm_shuf = _mm256_broadcastsi128_si256( + _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); src += width; while (width > 0) { src -= 32; @@ -473,8 +491,8 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_MIRRORUVROW_AVX2 LIBYUV_TARGET_AVX2 void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { - __m256i ymm_shuf = - _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + __m256i ymm_shuf = _mm256_broadcastsi128_si256( + _mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); src_uv += width * 2; while (width > 0) { src_uv -= 32; @@ -494,8 +512,8 @@ void MirrorSplitUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, int width) { - __m256i ymm_shuf = - _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1)); + __m256i ymm_shuf = _mm256_broadcastsi128_si256( + _mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1)); src_uv += width * 2; while (width > 0) { src_uv -= 32; @@ -516,25 +534,28 @@ LIBYUV_TARGET_AVX2 void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { - __m256i shuf0 = _mm256_setr_epi8( - -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, - -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2); - __m128i shuf1 = _mm_setr_epi8( - 13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1); + __m256i shuf0 = + _mm256_setr_epi8(-1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, -1, + 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2); + __m128i shuf1 = + _mm_setr_epi8(13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1); src_rgb24 += width * 3 - 96; while (width > 0) { __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0)); __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15)); - __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1); + __m256i v0 = + _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1); __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30)); __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45)); - __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1); + __m256i v1 = + _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1); __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)); __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75)); - __m256i v2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1); + __m256i v2 = + _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1); __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)); @@ -544,11 +565,14 @@ void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, v3 = _mm_shuffle_epi8(v3, shuf1); _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0)); - _mm_storeu_si128((__m128i*)(dst_rgb24 + 65), _mm256_extracti128_si256(v0, 1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 65), + _mm256_extracti128_si256(v0, 1)); _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1)); - _mm_storeu_si128((__m128i*)(dst_rgb24 + 35), _mm256_extracti128_si256(v1, 1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 35), + _mm256_extracti128_si256(v1, 1)); _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2)); - _mm_storeu_si128((__m128i*)(dst_rgb24 + 5), _mm256_extracti128_si256(v2, 1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 5), + _mm256_extracti128_si256(v2, 1)); _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3); src_rgb24 -= 96; @@ -629,7 +653,8 @@ void InterpolateRow_16_AVX2(uint16_t* dst_ptr, for (i = 0; i < width; i += 16) { __m256i row0 = _mm256_loadu_si256((const __m256i*)(src_ptr + i)); __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_ptr1 + i)); - _mm256_storeu_si256((__m256i*)(dst_ptr + i), _mm256_avg_epu16(row0, row1)); + _mm256_storeu_si256((__m256i*)(dst_ptr + i), + _mm256_avg_epu16(row0, row1)); } } else { for (i = 0; i < width; i += 16) { @@ -672,21 +697,23 @@ void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { #ifdef HAS_J400TOARGBROW_AVX2 alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_0[32] = { 0u, 0u, 0u, 128u, 1u, 1u, 1u, 128u, 2u, 2u, 2u, 128u, 3u, 3u, 3u, 128u, - 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u -}; + 4u, 4u, 4u, 128u, 5u, 5u, 5u, 128u, 6u, 6u, 6u, 128u, 7u, 7u, 7u, 128u}; alignas(32) static const uint8_t kShuffleMaskJ400ToARGB_1[32] = { - 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, 128u, 11u, 11u, 11u, 128u, - 12u, 12u, 12u, 128u, 13u, 13u, 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u -}; + 8u, 8u, 8u, 128u, 9u, 9u, 9u, 128u, 10u, 10u, 10u, + 128u, 11u, 11u, 11u, 128u, 12u, 12u, 12u, 128u, 13u, 13u, + 13u, 128u, 14u, 14u, 14u, 128u, 15u, 15u, 15u, 128u}; LIBYUV_TARGET_AVX2 void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { - __m256i ymm_mask0 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0); - __m256i ymm_mask1 = _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1); + __m256i ymm_mask0 = + _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_0); + __m256i ymm_mask1 = + _mm256_load_si256((const __m256i*)kShuffleMaskJ400ToARGB_1); __m256i ymm_alpha = _mm256_set1_epi32((int)0xff000000u); while (width > 0) { - __m256i ymm0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y)); + __m256i ymm0 = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)src_y)); __m256i ymm1 = _mm256_shuffle_epi8(ymm0, ymm_mask0); __m256i ymm2 = _mm256_shuffle_epi8(ymm0, ymm_mask1); @@ -707,13 +734,15 @@ void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width) { #ifdef HAS_RGB24TOARGBROW_AVX2 alignas(16) static const uint8_t kShuffleMaskRGB24ToARGB[2][16] = { {0u, 1u, 2u, 128u, 3u, 4u, 5u, 128u, 6u, 7u, 8u, 128u, 9u, 10u, 11u, 128u}, - {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, 128u} -}; + {4u, 5u, 6u, 128u, 7u, 8u, 9u, 128u, 10u, 11u, 12u, 128u, 13u, 14u, 15u, + 128u}}; #endif #ifdef HAS_RGB565TOARGBROW_AVX2 LIBYUV_TARGET_AVX2 -void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int width) { +void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, + uint8_t* dst_argb, + int width) { __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108); __m256i ymm_scale_g = _mm256_set1_epi32(0x20802080); __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800); @@ -730,11 +759,11 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb); ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb); ymm1 = _mm256_slli_epi16(ymm1, 8); - ymm1 = _mm256_or_si256(ymm1, ymm2); // RB + ymm1 = _mm256_or_si256(ymm1, ymm2); // RB ymm0 = _mm256_and_si256(ymm0, ymm_mask_g); ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g); - ymm0 = _mm256_or_si256(ymm0, ymm_mask_a); // GA + ymm0 = _mm256_or_si256(ymm0, ymm_mask_a); // GA ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0); ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0); @@ -755,7 +784,9 @@ void RGB565ToARGBRow_AVX2(const uint8_t* src_rgb565, uint8_t* dst_argb, int widt #ifdef HAS_ARGB1555TOARGBROW_AVX2 LIBYUV_TARGET_AVX2 -void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int width) { +void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, + uint8_t* dst_argb, + int width) { __m256i ymm_scale_rb = _mm256_set1_epi32(0x01080108); __m256i ymm_scale_g = _mm256_set1_epi32(0x42004200); __m256i ymm_mask_b = _mm256_set1_epi16((short)0xf800); @@ -773,14 +804,14 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int ymm2 = _mm256_mulhi_epu16(ymm2, ymm_scale_rb); ymm1 = _mm256_mulhi_epu16(ymm1, ymm_scale_rb); ymm1 = _mm256_slli_epi16(ymm1, 8); - ymm1 = _mm256_or_si256(ymm1, ymm2); // RB + ymm1 = _mm256_or_si256(ymm1, ymm2); // RB ymm2 = ymm0; ymm0 = _mm256_and_si256(ymm0, ymm_mask_g); ymm2 = _mm256_srai_epi16(ymm2, 8); ymm0 = _mm256_mulhi_epu16(ymm0, ymm_scale_g); ymm2 = _mm256_and_si256(ymm2, ymm_mask_a); - ymm0 = _mm256_or_si256(ymm0, ymm2); // GA + ymm0 = _mm256_or_si256(ymm0, ymm2); // GA ymm2 = _mm256_unpacklo_epi8(ymm1, ymm0); ymm1 = _mm256_unpackhi_epi8(ymm1, ymm0); @@ -801,7 +832,9 @@ void ARGB1555ToARGBRow_AVX2(const uint8_t* src_argb1555, uint8_t* dst_argb, int #ifdef HAS_ARGB4444TOARGBROW_AVX2 LIBYUV_TARGET_AVX2 -void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int width) { +void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, + uint8_t* dst_argb, + int width) { __m256i ymm_mask = _mm256_set1_epi32(0x0f0f0f0f); __m256i ymm_mask2 = _mm256_slli_epi32(ymm_mask, 4); @@ -841,27 +874,35 @@ void ARGB4444ToARGBRow_AVX2(const uint8_t* src_argb4444, uint8_t* dst_argb, int #ifdef HAS_RGB24TOARGBROW_AVX2 LIBYUV_TARGET_AVX2 -void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { +void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_argb, + int width) { __m256i ymm_alpha = _mm256_set1_epi32(0xff000000); - __m256i ymm_shuf = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0])); - __m256i ymm_shuf2 = _mm256_broadcastsi128_si256(_mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1])); + __m256i ymm_shuf = _mm256_broadcastsi128_si256( + _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[0])); + __m256i ymm_shuf2 = _mm256_broadcastsi128_si256( + _mm_load_si128((const __m128i*)kShuffleMaskRGB24ToARGB[1])); while (width > 0) { __m128i xmm0 = _mm_loadu_si128((const __m128i*)src_rgb24); __m256i ymm0 = _mm256_castsi128_si256(xmm0); - ymm0 = _mm256_inserti128_si256(ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1); + ymm0 = _mm256_inserti128_si256( + ymm0, _mm_loadu_si128((const __m128i*)(src_rgb24 + 12)), 1); __m128i xmm1 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 24)); __m256i ymm1 = _mm256_castsi128_si256(xmm1); - ymm1 = _mm256_inserti128_si256(ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1); + ymm1 = _mm256_inserti128_si256( + ymm1, _mm_loadu_si128((const __m128i*)(src_rgb24 + 36)), 1); __m128i xmm2 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 48)); __m256i ymm2 = _mm256_castsi128_si256(xmm2); - ymm2 = _mm256_inserti128_si256(ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1); + ymm2 = _mm256_inserti128_si256( + ymm2, _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)), 1); __m128i xmm3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 68)); __m256i ymm3 = _mm256_castsi128_si256(xmm3); - ymm3 = _mm256_inserti128_si256(ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1); + ymm3 = _mm256_inserti128_si256( + ymm3, _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)), 1); ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); ymm1 = _mm256_shuffle_epi8(ymm1, ymm_shuf); @@ -886,6 +927,50 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) } #endif +#ifdef HAS_ARGBSHUFFLEROW_AVX2 +LIBYUV_TARGET_AVX2 +void ARGBShuffleRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __m256i control = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)shuffler)); + while (width >= 16) { + __m256i row = _mm256_loadu_si256((const __m256i*)src_argb); + __m256i row1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32)); + row = _mm256_shuffle_epi8(row, control); + row1 = _mm256_shuffle_epi8(row1, control); + _mm256_storeu_si256((__m256i*)dst_argb, row); + _mm256_storeu_si256((__m256i*)(dst_argb + 32), row1); + src_argb += 64; + dst_argb += 64; + width -= 16; + } +} +#endif + +#ifdef HAS_ARGBSHUFFLEROW_AVX512BW +LIBYUV_TARGET_AVX512BW +void ARGBShuffleRow_AVX512BW(const uint8_t* src_argb, + uint8_t* dst_argb, + const uint8_t* shuffler, + int width) { + __m512i control = + _mm512_broadcast_i32x4(_mm_loadu_si128((const __m128i*)shuffler)); + while (width >= 32) { + __m512i row = _mm512_loadu_si512((const __m512i*)src_argb); + __m512i row1 = _mm512_loadu_si512((const __m512i*)(src_argb + 64)); + row = _mm512_shuffle_epi8(row, control); + row1 = _mm512_shuffle_epi8(row1, control); + _mm512_storeu_si512((__m512i*)dst_argb, row); + _mm512_storeu_si512((__m512i*)(dst_argb + 64), row1); + src_argb += 128; + dst_argb += 128; + width -= 32; + } +} +#endif + #endif #ifdef __cplusplus @@ -893,4 +978,7 @@ void RGB24ToARGBRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) } // namespace libyuv #endif -#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && ((defined(_MSC_VER) && !defined(__clang__)) || defined(LIBYUV_ENABLE_ROWWIN)) +#endif // !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || + // defined(__i386__) || defined(_M_X64) || defined(_M_X86)) && + // ((defined(_MSC_VER) && !defined(__clang__)) || + // defined(LIBYUV_ENABLE_ROWWIN)) diff --git a/source/scale.cc b/source/scale.cc index 0064a0991..4b7b2d3bc 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1951,9 +1951,9 @@ int ScalePlane(const uint8_t* src, // Reject dimensions larger than 32768 (or smaller than -32768 for height). // This prevents FixedDiv signed integer overflows that can lead to division // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations. - if (!src || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height < -32768 || src_height > 32768 || - !dst || dst_width <= 0 || dst_height <= 0) { + if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 || + src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 || + dst_height <= 0) { return -1; } // Simplify filtering when possible. @@ -2059,9 +2059,9 @@ int ScalePlane_16(const uint16_t* src, // Reject dimensions larger than 32768 (or smaller than -32768 for height). // This prevents FixedDiv signed integer overflows that can lead to division // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations. - if (!src || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height < -32768 || src_height > 32768 || - !dst || dst_width <= 0 || dst_height <= 0) { + if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 || + src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 || + dst_height <= 0) { return -1; } // Simplify filtering when possible. @@ -2171,9 +2171,9 @@ int ScalePlane_12(const uint16_t* src, // Reject dimensions larger than 32768 (or smaller than -32768 for height). // This prevents FixedDiv signed integer overflows that can lead to division // by zero/overflow crashes (SIGFPE on x86) or incorrect step calculations. - if (!src || src_width <= 0 || src_height == 0 || - src_width > 32768 || src_height < -32768 || src_height > 32768 || - !dst || dst_width <= 0 || dst_height <= 0) { + if (!src || src_width <= 0 || src_height == 0 || src_width > 32768 || + src_height < -32768 || src_height > 32768 || !dst || dst_width <= 0 || + dst_height <= 0) { return -1; } // Simplify filtering when possible. diff --git a/source/scale_common.cc b/source/scale_common.cc index 7040d0add..e2447119b 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -792,10 +792,10 @@ void ScaleFilterCols64_C(uint8_t* dst_ptr, #undef BLENDER // Same as 8 bit arm blender but return is cast to uint16_t -#define BLENDER(a, b, f) \ - (uint16_t)( \ - (int)(a) + \ - (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> 16)) +#define BLENDER(a, b, f) \ + (uint16_t)((int)(a) + \ + (int)((((int64_t)((f)) * ((int64_t)(b) - (int)(a))) + 0x8000) >> \ + 16)) void ScaleFilterCols_16_C(uint16_t* dst_ptr, const uint16_t* src_ptr, @@ -1196,7 +1196,7 @@ void ScaleARGBColsUp2_C(uint8_t* dst_argb, // TODO(fbarchard): Replace 0x7f ^ f with 128-f. bug=607. // Mimics SSSE3 blender -#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b)*f) >> 7 +#define BLENDER1(a, b, f) ((a) * (0x7f ^ f) + (b) * f) >> 7 #define BLENDERC(a, b, f, s) \ (uint32_t)(BLENDER1(((a) >> s) & 255, ((b) >> s) & 255, f) << s) #define BLENDER(a, b, f) \ diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index 5338482c5..773076669 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -1759,25 +1759,25 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, void ScaleAddRow_SSE2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("pxor %%xmm5,%%xmm5 \n" + asm volatile("pxor %%xmm5,%%xmm5 \n" // 16 pixel loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm3 \n" - "lea 0x10(%0),%0 \n" // src_ptr += 16 - "movdqu (%1),%%xmm0 \n" - "movdqu 0x10(%1),%%xmm1 \n" - "movdqa %%xmm3,%%xmm2 \n" - "punpcklbw %%xmm5,%%xmm2 \n" - "punpckhbw %%xmm5,%%xmm3 \n" - "paddusw %%xmm2,%%xmm0 \n" - "paddusw %%xmm3,%%xmm1 \n" - "movdqu %%xmm0,(%1) \n" - "movdqu %%xmm1,0x10(%1) \n" - "lea 0x20(%1),%1 \n" - "sub $0x10,%2 \n" - "jg 1b \n" + "1: \n" + "movdqu (%0),%%xmm3 \n" + "lea 0x10(%0),%0 \n" // src_ptr += 16 + "movdqu (%1),%%xmm0 \n" + "movdqu 0x10(%1),%%xmm1 \n" + "movdqa %%xmm3,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "punpckhbw %%xmm5,%%xmm3 \n" + "paddusw %%xmm2,%%xmm0 \n" + "paddusw %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 @@ -1790,23 +1790,23 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr, void ScaleAddRow_AVX2(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width) { - asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" + asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n" LABELALIGN - "1: \n" - "vmovdqu (%0),%%ymm3 \n" - "lea 0x20(%0),%0 \n" // src_ptr += 32 - "vpermq $0xd8,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" - "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddusw (%1),%%ymm2,%%ymm0 \n" - "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" - "vmovdqu %%ymm0,(%1) \n" - "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" - "sub $0x20,%2 \n" - "jg 1b \n" - "vzeroupper \n" + "1: \n" + "vmovdqu (%0),%%ymm3 \n" + "lea 0x20(%0),%0 \n" // src_ptr += 32 + "vpermq $0xd8,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm5,%%ymm3,%%ymm2 \n" + "vpunpckhbw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddusw (%1),%%ymm2,%%ymm0 \n" + "vpaddusw 0x20(%1),%%ymm3,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(src_width) // %2 diff --git a/source/scale_win.cc b/source/scale_win.cc index 870ed77b3..4b7fd3590 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -104,7 +104,7 @@ __declspec(naked) void ScaleRowDown2_SSSE3(const uint8_t* src_ptr, movdqu xmm0, [eax] movdqu xmm1, [eax + 16] lea eax, [eax + 32] - psrlw xmm0, 8 // isolate odd pixels. + psrlw xmm0, 8 // isolate odd pixels. psrlw xmm1, 8 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -138,7 +138,7 @@ __declspec(naked) void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr, lea eax, [eax + 32] pmaddubsw xmm0, xmm4 // horizontal add pmaddubsw xmm1, xmm4 - pavgw xmm0, xmm5 // (x + 1) / 2 + pavgw xmm0, xmm5 // (x + 1) / 2 pavgw xmm1, xmm5 packuswb xmm0, xmm1 movdqu [edx], xmm0 @@ -213,7 +213,7 @@ __declspec(naked) void ScaleRowDown2_AVX2(const uint8_t* src_ptr, vpsrlw ymm0, ymm0, 8 // isolate odd pixels. vpsrlw ymm1, ymm1, 8 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -249,7 +249,7 @@ __declspec(naked) void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr, vpavgw ymm0, ymm0, ymm5 // (x + 1) / 2 vpavgw ymm1, ymm1, ymm5 vpackuswb ymm0, ymm0, ymm1 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], ymm0 lea edx, [edx + 32] sub ecx, 32 @@ -319,7 +319,7 @@ __declspec(naked) void ScaleRowDown4_SSSE3(const uint8_t* src_ptr, // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 pslld xmm5, 16 @@ -424,7 +424,7 @@ __declspec(naked) void ScaleRowDown4_AVX2(const uint8_t* src_ptr, vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vpsrlw ymm0, ymm0, 8 vpackuswb ymm0, ymm0, ymm0 - vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb + vpermq ymm0, ymm0, 0xd8 // unmutate vpackuswb vmovdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 16 @@ -687,7 +687,7 @@ __declspec(naked) void ScaleRowDown38_SSSE3(const uint8_t* src_ptr, pshufb xmm1, xmm5 paddusb xmm0, xmm1 - movq qword ptr [edx], xmm0 // write 12 pixels + movq qword ptr [edx], xmm0 // write 12 pixels movhlps xmm1, xmm0 movd [edx + 8], xmm1 lea edx, [edx + 12] @@ -1030,7 +1030,7 @@ __declspec(naked) void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb, lea eax, [eax + 32] movdqa xmm2, xmm0 shufps xmm0, xmm1, 0x88 // even pixels - shufps xmm2, xmm1, 0xdd // odd pixels + shufps xmm2, xmm1, 0xdd // odd pixels pavgb xmm0, xmm2 movdqu [edx], xmm0 lea edx, [edx + 16] @@ -1216,7 +1216,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, test ecx, 2 je xloop29 - // 2 Pixels. + // 2 Pixels. movd xmm0, [esi + eax * 4] // 1 source x0 pixels movd xmm1, [esi + edx * 4] // 1 source x1 pixels pextrw eax, xmm2, 5 // get x2 integer. @@ -1229,7 +1229,7 @@ __declspec(naked) void ScaleARGBCols_SSE2(uint8_t* dst_argb, test ecx, 1 je xloop99 - // 1 Pixels. + // 1 Pixels. movd xmm0, [esi + eax * 4] // 1 source x2 pixels movd dword ptr [edi], xmm0 xloop99: diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 3048f728a..24456a524 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -464,8 +464,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) { static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) { double y1 = (y - 16) * 1.164384; *r = RoundToByte(y1 - (v - 128) * -1.67867); - *g = RoundToByte(y1 - (u - 128) * 0.187326 - - (v - 128) * 0.65042); + *g = RoundToByte(y1 - (u - 128) * 0.187326 - (v - 128) * 0.65042); *b = RoundToByte(y1 - (u - 128) * -2.14177); } diff --git a/unit_test/convert_argb_test.cc b/unit_test/convert_argb_test.cc index d7776c479..7f545a435 100644 --- a/unit_test/convert_argb_test.cc +++ b/unit_test/convert_argb_test.cc @@ -53,9 +53,9 @@ namespace libyuv { #define ABGRToABGR ARGBCopy // subsample amount uses a divide. -#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) -#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN)) #define TESTBPTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \ SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \ @@ -82,15 +82,19 @@ namespace libyuv { (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_y, \ + kPaddedWidth * kPaddedHeight * SRC_BPC + OFF); \ align_buffer_page_end( \ - src_uv, kSrcHalfPaddedWidth* kSrcHalfPaddedHeight* SRC_BPC * 2 + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + src_uv, \ + kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC * 2 + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC); \ + align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC); \ + align_buffer_page_end(dst_u_opt, \ + kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_v_opt, \ + kDstHalfWidth * kDstHalfHeight * DST_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ SRC_T* src_uv_p = reinterpret_cast(src_uv + OFF); \ for (int i = 0; i < kPaddedWidth * kPaddedHeight; ++i) { \ @@ -101,12 +105,12 @@ namespace libyuv { src_uv_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_y_c, 1, kWidth * kHeight * DST_BPC); \ + memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC); \ + memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_uv_p, kSrcHalfWidth * 2, \ @@ -223,11 +227,11 @@ TESTBPTOP(P012, uint16_t, 2, 2, 2, I012, uint16_t, 2, 2, 2, 12, 1, 1) const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (fastrand() & 0xff); \ } \ @@ -381,58 +385,58 @@ TESTPLANARTOB(I444, 1, 1, ABGR, 4, 4, 1) TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1) #endif -#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ - W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = W1280; \ - const int kHeight = benchmark_height_; \ - const int kStrideB = kWidth * BPP_B; \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ - align_buffer_page_end(src_uv, \ - kStrideUV* SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth; ++j) \ - src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ - for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ - for (int j = 0; j < kStrideUV * 2; ++j) { \ - src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \ - } \ - } \ - memset(dst_argb_c, 1, kStrideB* kHeight); \ - memset(dst_argb_opt, 101, kStrideB* kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ - dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ - dst_argb_opt, kWidth * BPP_B, kWidth, \ - NEG kHeight); \ - } \ - /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ - align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \ - align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ - memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ - memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ - FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ - kHeight); \ - FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ - kHeight); \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth * 4; ++j) { \ - ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j], \ - dst_argb32_opt[i * kWidth * 4 + j]); \ - } \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_uv); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - free_aligned_buffer_page_end(dst_argb32_c); \ - free_aligned_buffer_page_end(dst_argb32_opt); \ +#define TESTBPTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \ + W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = W1280; \ + const int kHeight = benchmark_height_; \ + const int kStrideB = kWidth * BPP_B; \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ + align_buffer_page_end( \ + src_uv, kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2 + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \ + for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ + for (int j = 0; j < kStrideUV * 2; ++j) { \ + src_uv[i * kStrideUV * 2 + j + OFF] = (fastrand() & 0xff); \ + } \ + } \ + memset(dst_argb_c, 1, kStrideB * kHeight); \ + memset(dst_argb_opt, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ + dst_argb_c, kWidth * BPP_B, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_uv + OFF, kStrideUV * 2, \ + dst_argb_opt, kWidth * BPP_B, kWidth, \ + NEG kHeight); \ + } \ + /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ + align_buffer_page_end(dst_argb32_c, kWidth * 4 * kHeight); \ + align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 2, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 102, kWidth * 4 * kHeight); \ + FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth, \ + kHeight); \ + FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \ + kHeight); \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * 4; ++j) { \ + ASSERT_EQ(dst_argb32_c[i * kWidth * 4 + j], \ + dst_argb32_opt[i * kWidth * 4 + j]); \ + } \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + free_aligned_buffer_page_end(dst_argb32_c); \ + free_aligned_buffer_page_end(dst_argb32_opt); \ } #if defined(ENABLE_FULL_TESTS) @@ -507,15 +511,16 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2) const int kStrideB = \ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, \ + kStrideB * kHeightB * (int)sizeof(TYPE_B)); \ align_buffer_page_end(dst_argb_opt, \ - kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ + kStrideB * kHeightB * (int)sizeof(TYPE_B)); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ - memset(dst_argb_c, 1, kStrideB* kHeightB); \ - memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + memset(dst_argb_c, 1, kStrideB * kHeightB); \ + memset(dst_argb_opt, 101, kStrideB * kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \ kStrideB, kWidth, NEG kHeight); \ @@ -532,41 +537,42 @@ TESTBPTOB(NV12, 2, 2, RGB565, RGB565, 2) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ - TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ - align_buffer_page_end(dst_argb_c, \ - kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ - align_buffer_page_end(dst_argb_opt, \ - kStrideB* kHeightB*(int)sizeof(TYPE_B)); \ - for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ - src_argb[i] = 0xfe; \ - } \ - memset(dst_argb_c, 123, kStrideB* kHeightB); \ - memset(dst_argb_opt, 123, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ - kStrideB, kWidth, kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ - kStrideB, kWidth, kHeight); \ - for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ - ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } \ +#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ + TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_c, \ + kStrideB * kHeightB * (int)sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB * kHeightB * (int)sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ + src_argb[i] = 0xfe; \ + } \ + memset(dst_argb_c, 123, kStrideB * kHeightB); \ + memset(dst_argb_opt, 123, kStrideB * kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ + kStrideB, kWidth, kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * (int)sizeof(TYPE_B); ++i) { \ + ASSERT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ } #if defined(ENABLE_FULL_TESTS) @@ -672,11 +678,11 @@ TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) const int kStrideB = \ (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ align_buffer_page_end(dst_argb_c, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ align_buffer_page_end(dst_argb_opt, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ @@ -791,14 +797,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + align_buffer_page_end(src_argb, kStrideA * kHeightA + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ - memset(dst_argb_c, 1, kStrideB* kHeightB); \ - memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + memset(dst_argb_c, 1, kStrideB * kHeightB); \ + memset(dst_argb_opt, 101, kStrideB * kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, \ NULL, kWidth, NEG kHeight); \ @@ -827,14 +833,14 @@ TESTATOA(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ + align_buffer_page_end(src_argb, kStrideA * kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i] = (fastrand() & 0xff); \ } \ - memset(dst_argb_c, 123, kStrideB* kHeightB); \ - memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + memset(dst_argb_c, 123, kStrideB * kHeightB); \ + memset(dst_argb_opt, 123, kStrideB * kHeightB); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B##Dither(src_argb, kStrideA, dst_argb_c, kStrideB, NULL, \ kWidth, kHeight); \ @@ -885,15 +891,16 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) const int kStrideA = \ (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ align_buffer_page_end(dst_argb_opt, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A)); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A)); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ - memset(dst_argb_c, 1, kStrideA* kHeightA); \ - memset(dst_argb_opt, 101, kStrideA* kHeightA); \ + memset(dst_argb_c, 1, kStrideA * kHeightA); \ + memset(dst_argb_opt, 101, kStrideA * kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \ kStrideA, kWidth, NEG kHeight); \ @@ -945,12 +952,12 @@ TESTEND(AB64ToAR64, uint16_t, 4, 4, 1) const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + align_buffer_page_end(src_a, kWidth * kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (fastrand() & 0xff); \ src_a[i + OFF] = (fastrand() & 0xff); \ @@ -1240,11 +1247,11 @@ TEST_F(LibYUVConvertTest, TestDither) { const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (fastrand() & 0xff); \ } \ @@ -1265,10 +1272,10 @@ TEST_F(LibYUVConvertTest, TestDither) { dst_argb_opt + OFF, kStrideB, NULL, kWidth, NEG kHeight); \ } \ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ - align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \ - align_buffer_page_end(dst_argb32_opt, kWidth* BPP_C* kHeight); \ - memset(dst_argb32_c, 2, kWidth* BPP_C* kHeight); \ - memset(dst_argb32_opt, 102, kWidth* BPP_C* kHeight); \ + align_buffer_page_end(dst_argb32_c, kWidth * BPP_C * kHeight); \ + align_buffer_page_end(dst_argb32_opt, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_c, 2, kWidth * BPP_C * kHeight); \ + memset(dst_argb32_opt, 102, kWidth * BPP_C * kHeight); \ FMT_B##To##FMT_C(dst_argb_c + OFF, kStrideB, dst_argb32_c, kWidth * BPP_C, \ kWidth, kHeight); \ FMT_B##To##FMT_C(dst_argb_opt + OFF, kStrideB, dst_argb32_opt, \ @@ -1317,10 +1324,10 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ src_y[i + OFF] = (fastrand() & 0xff); \ } \ @@ -1334,8 +1341,8 @@ TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, ARGB, 4) kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ - align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ - align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \ + align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ @@ -1464,14 +1471,14 @@ TESTPLANARTOE(I444, 1, 1, ABGR, 1, 4, ARGB, 4) const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_u, kSizeUV + OFF); \ align_buffer_page_end(src_v, kSizeUV + OFF); \ - align_buffer_page_end(src_a, kWidth* kHeight + OFF); \ - align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ + align_buffer_page_end(src_a, kWidth * kHeight + OFF); \ + align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \ const int kStrideC = kWidth * BPP_C; \ - align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ - align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \ + align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ @@ -1578,16 +1585,16 @@ TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4) const int kHeight = benchmark_height_; \ const int kStrideA = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideB = SUBSAMPLE(kWidth, SUB_B) * BPP_B; \ - align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \ - align_buffer_page_end(dst_argb_b, kStrideB* kHeight + OFF); \ + align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF); \ + align_buffer_page_end(dst_argb_b, kStrideB * kHeight + OFF); \ MemRandomize(src_argb_a + OFF, kStrideA * kHeight); \ memset(dst_argb_b + OFF, 1, kStrideB * kHeight); \ FMT_A##To##FMT_B(src_argb_a + OFF, kStrideA, dst_argb_b + OFF, kStrideB, \ kWidth, NEG kHeight); \ /* Convert to a 3rd format in 1 step and 2 steps and compare */ \ const int kStrideC = kWidth * BPP_C; \ - align_buffer_page_end(dst_argb_c, kStrideC* kHeight + OFF); \ - align_buffer_page_end(dst_argb_bc, kStrideC* kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideC * kHeight + OFF); \ + align_buffer_page_end(dst_argb_bc, kStrideC * kHeight + OFF); \ memset(dst_argb_c + OFF, 2, kStrideC * kHeight); \ memset(dst_argb_bc + OFF, 3, kStrideC * kHeight); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ @@ -1798,11 +1805,11 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV * kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV * kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + SOFF)[i] = (fastrand() & FMT_MASK); \ } \ @@ -1913,12 +1920,12 @@ TESTPLANAR16TOB(I210, 2, 1, 0x3ff, AR30Filter, 4, 4, 1) const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ - align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight * kBpc + OFF); \ + align_buffer_page_end(src_u, kSizeUV * kBpc + OFF); \ + align_buffer_page_end(src_v, kSizeUV * kBpc + OFF); \ + align_buffer_page_end(src_a, kWidth * kHeight * kBpc + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + OFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + OFF)[i] = \ (fastrand() & ((1 << S_DEPTH) - 1)); \ @@ -2146,10 +2153,10 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10) const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \ const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(src_y, kWidth * kHeight * kBpc + SOFF); \ + align_buffer_page_end(src_uv, kSizeUV * kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight + DOFF); \ for (int i = 0; i < kWidth * kHeight; ++i) { \ reinterpret_cast(src_y + SOFF)[i] = \ (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ @@ -2831,16 +2838,23 @@ TEST_F(LibYUVConvertTest, TestARGBToUVMatrixRow_Opt) { memset(dest_v_c, 0, sizeof(dest_v_c)); memset(dest_u_opt, 0, sizeof(dest_u_opt)); memset(dest_v_opt, 0, sizeof(dest_v_opt)); - + int src_stride = (height == 1) ? 0 : kMaxWidth * 4; - ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], &dest_v_c[0], width, &kArgbI601Constants); - ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, &dest_u_opt[0], &dest_v_opt[0], width, &kArgbI601Constants); + ARGBToUVMatrixRow_C(&orig_argb_pixels[0], src_stride, &dest_u_c[0], + &dest_v_c[0], width, &kArgbI601Constants); + ARGBToUVMatrixRow_Any_NEON(&orig_argb_pixels[0], src_stride, + &dest_u_opt[0], &dest_v_opt[0], width, + &kArgbI601Constants); int half_width = (width + 1) / 2; for (int i = 0; i < half_width; ++i) { - ASSERT_EQ(dest_u_c[i], dest_u_opt[i]) << "u mismatch at " << i << " width " << width << " height " << height; - ASSERT_EQ(dest_v_c[i], dest_v_opt[i]) << "v mismatch at " << i << " width " << width << " height " << height; + ASSERT_EQ(dest_u_c[i], dest_u_opt[i]) + << "u mismatch at " << i << " width " << width << " height " + << height; + ASSERT_EQ(dest_v_c[i], dest_v_opt[i]) + << "v mismatch at " << i << " width " << width << " height " + << height; } } } @@ -2903,13 +2917,12 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { free_aligned_buffer_page_end(dest_argb); free_aligned_buffer_page_end(orig_i400); } -#endif // DISABLE_SLOW_TESTS +#endif // DISABLE_SLOW_TESTS #endif // !defined(DISABLE_SLOW_TESTS) && \ // (defined(__x86_64__) || defined(_M_X64) || defined(__aarch64__)) #endif // !defined(LEAN_TESTS) - #define TESTATOBPI(FMT_A, TYPE_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ @@ -2922,17 +2935,17 @@ TEST_F(LibYUVConvertTest, TestI400LargeSize) { const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ align_buffer_page_end(src_argb, \ - kStrideA* kHeightA*(int)sizeof(TYPE_A) + OFF); \ - align_buffer_page_end(dst_y_c, kStrideY* kHeight); \ + kStrideA * kHeightA * (int)sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_y_c, kStrideY * kHeight); \ align_buffer_page_end(dst_uv_c, kSizeUV); \ - align_buffer_page_end(dst_y_opt, kStrideY* kHeight); \ + align_buffer_page_end(dst_y_opt, kStrideY * kHeight); \ align_buffer_page_end(dst_uv_opt, kSizeUV); \ for (int i = 0; i < kStrideA * kHeightA * (int)sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ - memset(dst_y_c, 1, kStrideY* kHeight); \ + memset(dst_y_c, 1, kStrideY * kHeight); \ memset(dst_uv_c, 2, kSizeUV); \ - memset(dst_y_opt, 101, kStrideY* kHeight); \ + memset(dst_y_opt, 101, kStrideY * kHeight); \ memset(dst_uv_opt, 102, kSizeUV); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, dst_y_c, kStrideY, \ diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 9b7cc85d9..a38e7fdf9 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -51,9 +51,9 @@ namespace libyuv { #define ABGRToABGR ARGBCopy // subsample amount uses a divide. -#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) -#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN)) +#define ALIGNINT(V, ALIGN) (((V) + (ALIGN) - 1) / (ALIGN) * (ALIGN)) // Planar test @@ -78,17 +78,19 @@ namespace libyuv { const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF); \ align_buffer_page_end(src_u, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ align_buffer_page_end(src_v, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_c, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ - align_buffer_page_end(dst_u_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - align_buffer_page_end(dst_v_opt, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC); \ + align_buffer_page_end(dst_u_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_v_c, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC); \ + align_buffer_page_end(dst_u_opt, \ + kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + align_buffer_page_end(dst_v_opt, \ + kDstHalfWidth * kDstHalfHeight * DST_BPC); \ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ @@ -102,12 +104,12 @@ namespace libyuv { src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_u_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ - memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \ + memset(dst_y_c, 1, kWidth * kHeight * DST_BPC); \ + memset(dst_u_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_v_c, 3, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC); \ + memset(dst_u_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ + memset(dst_v_opt, 103, kDstHalfWidth * kDstHalfHeight * DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \ @@ -212,15 +214,15 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_uv, \ - kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight); \ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ @@ -239,12 +241,12 @@ TESTPLANARTOP(I412, uint16_t, 2, 1, 1, I444, uint8_t, 1, 1, 1, 12) (fastrand() & 0xff); \ } \ } \ - memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 1, kWidth * kHeight); \ memset(dst_u_c, 2, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_c, 3, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_u_opt, 102, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_opt, 103, \ @@ -359,17 +361,17 @@ static int I400ToNV21(const uint8_t* src_y, const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight * SRC_BPC + OFF); \ align_buffer_page_end(src_u, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ + kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ align_buffer_page_end(src_v, \ - kSrcHalfWidth* kSrcHalfHeight* SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC); \ align_buffer_page_end(dst_uv_c, \ - kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + kDstHalfWidth * kDstHalfHeight * DST_BPC * 2); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC); \ align_buffer_page_end(dst_uv_opt, \ - kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + kDstHalfWidth * kDstHalfHeight * DST_BPC * 2); \ MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \ MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \ @@ -383,10 +385,10 @@ static int I400ToNV21(const uint8_t* src_y, src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \ src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \ } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ - memset(dst_uv_opt, 102, kDstHalfWidth* kDstHalfHeight* DST_BPC * 2); \ + memset(dst_y_c, 1, kWidth * kHeight * DST_BPC); \ + memset(dst_uv_c, 2, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2); \ + memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC); \ + memset(dst_uv_opt, 102, kDstHalfWidth * kDstHalfHeight * DST_BPC * 2); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR(src_y_p, kWidth, src_u_p, kSrcHalfWidth, \ src_v_p, kSrcHalfWidth, \ @@ -478,14 +480,15 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) (kHeight + (TILE_HEIGHT - 1)) & ~(TILE_HEIGHT - 1); \ const int kSrcHalfPaddedWidth = SUBSAMPLE(kPaddedWidth, SRC_SUBSAMP_X); \ const int kSrcHalfPaddedHeight = SUBSAMPLE(kPaddedHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kPaddedWidth* kPaddedHeight* SRC_BPC + OFF); \ + align_buffer_page_end(src_y, \ + kPaddedWidth * kPaddedHeight * SRC_BPC + OFF); \ align_buffer_page_end( \ src_uv, \ 2 * kSrcHalfPaddedWidth * kSrcHalfPaddedHeight * SRC_BPC + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight * DST_BPC); \ align_buffer_page_end(dst_uv_c, \ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight * DST_BPC); \ align_buffer_page_end(dst_uv_opt, \ 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ SRC_T* src_y_p = reinterpret_cast(src_y + OFF); \ @@ -502,13 +505,13 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) src_uv_p[i] = \ (fastrand() & (((SRC_T)(-1)) << ((8 * SRC_BPC) - SRC_DEPTH))); \ } \ - memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \ + memset(dst_y_c, 1, kWidth * kHeight * DST_BPC); \ memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ - memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \ + memset(dst_y_opt, 101, kWidth * kHeight * DST_BPC); \ memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \ MaskCpuFlags(disable_cpu_flags_); \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ DOY ? reinterpret_cast(dst_y_c) : NULL, kWidth, \ reinterpret_cast(dst_uv_c), 2 * kDstHalfWidth, kWidth, \ @@ -516,7 +519,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12) MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ SRC_FMT_PLANAR##To##FMT_PLANAR( \ - src_y_p, kWidth* SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ + src_y_p, kWidth * SRC_BPC / (int)sizeof(SRC_T), src_uv_p, \ 2 * kSrcHalfWidth * SRC_BPC / (int)sizeof(SRC_T), \ DOY ? reinterpret_cast(dst_y_opt) : NULL, kWidth, \ reinterpret_cast(dst_uv_opt), 2 * kDstHalfWidth, kWidth, \ @@ -598,16 +601,16 @@ TESTBPTOBP(P010, uint16_t, 2, 2, 2, NV12, uint8_t, 1, 2, 2, 8, 1, 1) const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(src_argb, kStride * kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight); \ align_buffer_page_end(dst_uv_c, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ align_buffer_page_end(dst_uv_opt, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 1, kWidth * kHeight); \ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ @@ -691,20 +694,20 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1) const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ const int kStride = (kStrideUV * SUBSAMP_X * 8 * BPP_A + 7) / 8; \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_a_c, kWidth* kHeight); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(src_argb, kStride * kHeight + OFF); \ + align_buffer_page_end(dst_a_c, kWidth * kHeight); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight); \ align_buffer_page_end(dst_uv_c, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_a_opt, kWidth* kHeight); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_a_opt, kWidth * kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ align_buffer_page_end(dst_uv_opt, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_a_c, 1, kWidth* kHeight); \ - memset(dst_y_c, 2, kWidth* kHeight); \ + memset(dst_a_c, 1, kWidth * kHeight); \ + memset(dst_y_c, 2, kWidth * kHeight); \ memset(dst_uv_c, 3, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_a_opt, 101, kWidth* kHeight); \ - memset(dst_y_opt, 102, kWidth* kHeight); \ + memset(dst_a_opt, 101, kWidth * kHeight); \ + memset(dst_y_opt, 102, kWidth * kHeight); \ memset(dst_uv_opt, 103, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ @@ -765,19 +768,19 @@ TESTATOPLANARA(ARGB, 4, 1, I420Alpha, 2, 2) const int kHeight = benchmark_height_; \ const int kStride = SUBSAMPLE(kWidth, SUB_A) * BPP_A; \ const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - align_buffer_page_end(src_argb, kStride* kHeight + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + align_buffer_page_end(src_argb, kStride * kHeight + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight); \ align_buffer_page_end(dst_uv_c, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ align_buffer_page_end(dst_uv_opt, \ kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ src_argb[(i * kStride) + j + OFF] = (fastrand() & 0xff); \ - memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 1, kWidth * kHeight); \ memset(dst_uv_c, 2, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_uv_opt, 102, kStrideUV * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ MaskCpuFlags(disable_cpu_flags_); \ FMT_A##To##FMT_PLANAR(src_argb + OFF, kStride, dst_y_c, kWidth, dst_uv_c, \ @@ -1950,17 +1953,17 @@ TEST_F(LibYUVConvertTest, I420CropOddY) { const int kHeight = benchmark_height_; \ \ align_buffer_page_end(orig_uyvy, 4 * SUBSAMPLE(kWidth, 2) * kHeight); \ - align_buffer_page_end(orig_y, kWidth* kHeight); \ + align_buffer_page_end(orig_y, kWidth * kHeight); \ align_buffer_page_end(orig_u, \ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ align_buffer_page_end(orig_v, \ SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ \ - align_buffer_page_end(dst_y_orig, kWidth* kHeight); \ + align_buffer_page_end(dst_y_orig, kWidth * kHeight); \ align_buffer_page_end(dst_uv_orig, \ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ \ - align_buffer_page_end(dst_y, kWidth* kHeight); \ + align_buffer_page_end(dst_y, kWidth * kHeight); \ align_buffer_page_end(dst_uv, \ 2 * SUBSAMPLE(kWidth, 2) * SUBSAMPLE(kHeight, 2)); \ \ @@ -2287,12 +2290,13 @@ TEST_F(LibYUVConvertTest, TestARGBToI420Matrix) { dst_v, kWidth / 2, &kArgbU2020Constants, kWidth, kHeight); // Reference BT.709 (limited range) - // Y = round(0.2126 * 219 / 255 * R + 0.7152 * 219 / 255 * G + 0.0722 * 219 / 255 * B + 16) - // Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16) - // 47 * 255 + 157 * 255 + 16 * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324 - // 60324 / 256 = 235.64 -> 235. Correct. + // Y = round(0.2126 * 219 / 255 * R + 0.7152 * 219 / 255 * G + 0.0722 * 219 / + // 255 * B + 16) Y = round(0.1826 * R + 0.6142 * G + 0.0620 * B + 16) 47 * 255 + // + 157 * 255 + 16 * 255 + 4224 = 11985 + 40035 + 4080 + 4224 = 60324 60324 / + // 256 = 235.64 -> 235. Correct. - for (int i = 0; i < kWidth * kHeight * 4; ++i) src_argb[i] = 255; + for (int i = 0; i < kWidth * kHeight * 4; ++i) + src_argb[i] = 255; ARGBToI420Matrix(src_argb, kWidth * 4, dst_y, kWidth, dst_u, kWidth / 2, dst_v, kWidth / 2, &kArgbH709Constants, kWidth, kHeight); ASSERT_EQ(dst_y[0], 235); @@ -2423,6 +2427,132 @@ TEST_F(LibYUVConvertTest, TestARGBToI444Matrix) { free_aligned_buffer_page_end(ref_v); } +template +static void TestRGBToI420(ConvertToYUV convert_to_yuv, + ConvertToARGB convert_to_argb, + int width, + int height, + int disable_cpu_flags, + int benchmark_cpu_info) { + align_buffer_page_end(src_rgb, width * height * 4); + align_buffer_page_end(dst_y, width * height); + align_buffer_page_end(dst_u, (width + 1) / 2 * (height + 1) / 2); + align_buffer_page_end(dst_v, (width + 1) / 2 * (height + 1) / 2); + + align_buffer_page_end(tmp_argb, width * height * 4); + align_buffer_page_end(ref_y, width * height); + align_buffer_page_end(ref_u, (width + 1) / 2 * (height + 1) / 2); + align_buffer_page_end(ref_v, (width + 1) / 2 * (height + 1) / 2); + + MemRandomize(src_rgb, width * height * 4); + + { + SCOPED_TRACE("C_Version"); + MaskCpuFlags(disable_cpu_flags); + + // Clear buffers + memset(dst_y, 0, width * height); + memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2); + memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2); + memset(ref_y, 0, width * height); + memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2); + memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2); + memset(tmp_argb, 0, width * height * 4); + + int r1 = + convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2, + dst_v, (width + 1) / 2, width, height); + ASSERT_EQ(r1, 0); + + int r2 = + convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height); + ASSERT_EQ(r2, 0); + + int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u, + (width + 1) / 2, ref_v, (width + 1) / 2, width, height); + ASSERT_EQ(r3, 0); + + for (int i = 0; i < width * height; ++i) { + ASSERT_EQ(dst_y[i], ref_y[i]); + } + for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) { + ASSERT_EQ(dst_u[i], ref_u[i]); + ASSERT_EQ(dst_v[i], ref_v[i]); + } + } + + { + SCOPED_TRACE("SIMD_Version"); + MaskCpuFlags(benchmark_cpu_info); + + // Clear buffers + memset(dst_y, 0, width * height); + memset(dst_u, 0, (width + 1) / 2 * (height + 1) / 2); + memset(dst_v, 0, (width + 1) / 2 * (height + 1) / 2); + memset(ref_y, 0, width * height); + memset(ref_u, 0, (width + 1) / 2 * (height + 1) / 2); + memset(ref_v, 0, (width + 1) / 2 * (height + 1) / 2); + memset(tmp_argb, 0, width * height * 4); + + int r1 = + convert_to_yuv(src_rgb, width * 4, dst_y, width, dst_u, (width + 1) / 2, + dst_v, (width + 1) / 2, width, height); + ASSERT_EQ(r1, 0); + + int r2 = + convert_to_argb(src_rgb, width * 4, tmp_argb, width * 4, width, height); + ASSERT_EQ(r2, 0); + + int r3 = ARGBToI420(tmp_argb, width * 4, ref_y, width, ref_u, + (width + 1) / 2, ref_v, (width + 1) / 2, width, height); + ASSERT_EQ(r3, 0); + + for (int i = 0; i < width * height; ++i) { + ASSERT_EQ(dst_y[i], ref_y[i]); + } + for (int i = 0; i < (width + 1) / 2 * (height + 1) / 2; ++i) { + ASSERT_EQ(dst_u[i], ref_u[i]); + ASSERT_EQ(dst_v[i], ref_v[i]); + } + } + + free_aligned_buffer_page_end(src_rgb); + free_aligned_buffer_page_end(dst_y); + free_aligned_buffer_page_end(dst_u); + free_aligned_buffer_page_end(dst_v); + free_aligned_buffer_page_end(tmp_argb); + free_aligned_buffer_page_end(ref_y); + free_aligned_buffer_page_end(ref_u); + free_aligned_buffer_page_end(ref_v); +} + +TEST_F(LibYUVConvertTest, BGRAToI420_Check) { + TestRGBToI420(BGRAToI420, BGRAToARGB, 16, 16, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(BGRAToI420, BGRAToARGB, 17, 17, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(BGRAToI420, BGRAToARGB, 1280, 720, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVConvertTest, RGBAToI420_Check) { + TestRGBToI420(RGBAToI420, RGBAToARGB, 16, 16, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(RGBAToI420, RGBAToARGB, 17, 17, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(RGBAToI420, RGBAToARGB, 1280, 720, disable_cpu_flags_, + benchmark_cpu_info_); +} + +TEST_F(LibYUVConvertTest, ABGRToI420_Check) { + TestRGBToI420(ABGRToI420, ABGRToARGB, 16, 16, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(ABGRToI420, ABGRToARGB, 17, 17, disable_cpu_flags_, + benchmark_cpu_info_); + TestRGBToI420(ABGRToI420, ABGRToARGB, 1280, 720, disable_cpu_flags_, + benchmark_cpu_info_); +} + #endif // !defined(LEAN_TESTS) } // namespace libyuv diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index d37001f1b..7eba494b7 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1212,10 +1212,10 @@ TEST_F(LibYUVPlanarTest, TestInterpolatePlane_16) { (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = \ (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb_a, kStrideA* kHeight + OFF); \ - align_buffer_page_end(src_argb_b, kStrideA* kHeight + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight); \ + align_buffer_page_end(src_argb_a, kStrideA * kHeight + OFF); \ + align_buffer_page_end(src_argb_b, kStrideA * kHeight + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeight); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeight); \ for (int i = 0; i < kStrideA * kHeight; ++i) { \ src_argb_a[i + OFF] = (fastrand() & 0xff); \ src_argb_b[i + OFF] = (fastrand() & 0xff); \ @@ -1418,7 +1418,7 @@ TEST_F(LibYUVPlanarTest, BlendPlane_Invert) { disable_cpu_flags_, benchmark_cpu_info_, -1, 1); } -#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) static void TestI420Blend(int width, int height, diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc index 9256f8de0..10ee64cbc 100644 --- a/unit_test/rotate_test.cc +++ b/unit_test/rotate_test.cc @@ -20,7 +20,7 @@ namespace libyuv { -#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) +#define SUBSAMPLE(v, a) ((((v) + (a) - 1)) / (a)) static void I420TestRotate(int src_width, int src_height, @@ -495,15 +495,15 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) { const int kHeight = benchmark_height_; \ const int kSizeUV = \ SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \ - align_buffer_page_end(src_y, kWidth* kHeight + OFF); \ + align_buffer_page_end(src_y, kWidth * kHeight + OFF); \ align_buffer_page_end(src_uv, \ - kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ - align_buffer_page_end(dst_y_c, kWidth* kHeight); \ + kSizeUV * ((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \ + align_buffer_page_end(dst_y_c, kWidth * kHeight); \ align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - align_buffer_page_end(dst_y_opt, kWidth* kHeight); \ + align_buffer_page_end(dst_y_opt, kWidth * kHeight); \ align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ SUBSAMPLE(kHeight, SUBSAMP_Y)); \ align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \ @@ -522,12 +522,12 @@ TEST_F(LibYUVRotateTest, NV12Rotate270_Invert) { (fastrand() & 0xff); \ } \ } \ - memset(dst_y_c, 1, kWidth* kHeight); \ + memset(dst_y_c, 1, kWidth * kHeight); \ memset(dst_u_c, 2, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_c, 3, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ - memset(dst_y_opt, 101, kWidth* kHeight); \ + memset(dst_y_opt, 101, kWidth * kHeight); \ memset(dst_u_opt, 102, \ SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \ memset(dst_v_opt, 103, \ diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 219e196dd..3d3e36fc5 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -431,13 +431,13 @@ static void FillRamp(uint8_t* buf, // Test scaling with C vs Opt and return maximum pixel difference. 0 = exact. static void YUVToARGBTestFilter(int src_width, - int src_height, - int dst_width, - int dst_height, - FilterMode f, - int benchmark_iterations, - int error_threshold, - int* max_diff_out) { + int src_height, + int dst_width, + int dst_height, + FilterMode f, + int benchmark_iterations, + int error_threshold, + int* max_diff_out) { int64_t src_y_plane_size = Abs(src_width) * Abs(src_height); int64_t src_uv_plane_size = ((Abs(src_width) + 1) / 2) * ((Abs(src_height) + 1) / 2); @@ -448,8 +448,8 @@ static void YUVToARGBTestFilter(int src_width, align_buffer_page_end(src_u, src_uv_plane_size); align_buffer_page_end(src_v, src_uv_plane_size); - int64_t dst_argb_plane_size = (dst_width) * (dst_height)*4LL; - int dst_stride_argb = (dst_width)*4; + int64_t dst_argb_plane_size = (dst_width) * (dst_height) * 4LL; + int dst_stride_argb = (dst_width) * 4; align_buffer_page_end(dst_argb_c, dst_argb_plane_size); align_buffer_page_end(dst_argb_opt, dst_argb_plane_size); if (!dst_argb_c || !dst_argb_opt || !src_y || !src_u || !src_v) { @@ -516,10 +516,10 @@ TEST_F(LibYUVScaleTest, YUVToRGBScaleUp) { TEST_F(LibYUVScaleTest, YUVToRGBScaleDown) { int diff = 0; - YUVToARGBTestFilter( - benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, benchmark_width_, - benchmark_height_, libyuv::kFilterBilinear, benchmark_iterations_, 10, - &diff); + YUVToARGBTestFilter(benchmark_width_ * 3 / 2, benchmark_height_ * 3 / 2, + benchmark_width_, benchmark_height_, + libyuv::kFilterBilinear, benchmark_iterations_, 10, + &diff); ASSERT_LE(diff, 10); } diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 750e340fa..323094f3f 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -757,7 +757,7 @@ static int NV12TestFilter(int src_width, int src_height_uv = (Abs(src_height) + 1) >> 1; int64_t src_y_plane_size = (Abs(src_width)) * (Abs(src_height)); - int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv)*2; + int64_t src_uv_plane_size = (src_width_uv) * (src_height_uv) * 2; int src_stride_y = Abs(src_width); int src_stride_uv = src_width_uv * 2; @@ -775,7 +775,7 @@ static int NV12TestFilter(int src_width, int dst_height_uv = (dst_height + 1) >> 1; int64_t dst_y_plane_size = (dst_width) * (dst_height); - int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv)*2; + int64_t dst_uv_plane_size = (dst_width_uv) * (dst_height_uv) * 2; int dst_stride_y = dst_width; int dst_stride_uv = dst_width_uv * 2; diff --git a/unit_test/unit_test.h b/unit_test/unit_test.h index 2c11c983f..e9a55c62f 100644 --- a/unit_test/unit_test.h +++ b/unit_test/unit_test.h @@ -85,10 +85,11 @@ static inline bool SizeValid(int src_width, #define align_buffer_page_end_16(var, size) \ uint16_t* var = NULL; \ uint8_t* var##_mem = \ - reinterpret_cast(malloc(((size)*2 + 4095 + 63) & ~4095)); \ + reinterpret_cast(malloc(((size) * 2 + 4095 + 63) & ~4095)); \ if (var##_mem) \ var = reinterpret_cast( \ - (intptr_t)(var##_mem + (((size)*2 + 4095 + 63) & ~4095) - (size)*2) & \ + (intptr_t)(var##_mem + (((size) * 2 + 4095 + 63) & ~4095) - \ + (size) * 2) & \ ~63) #define free_aligned_buffer_page_end_16(var) \ diff --git a/util/ssim.cc b/util/ssim.cc index 096fbcf06..f8b4509f8 100644 --- a/util/ssim.cc +++ b/util/ssim.cc @@ -244,23 +244,23 @@ double GetSSIMFullKernel(const uint8_t* org, // Read 8 pixels at line #L, and convert to 16bit, perform weighting // and acccumulate. -#define LOAD_LINE_PAIR(L, WEIGHT) \ - do { \ - const __m128i v0 = \ - _mm_loadl_epi64(reinterpret_cast(org + (L)*stride)); \ - const __m128i v1 = \ - _mm_loadl_epi64(reinterpret_cast(rec + (L)*stride)); \ - const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ - const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ - const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ - const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ - x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ - x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ - y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ - xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ - xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ - yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ +#define LOAD_LINE_PAIR(L, WEIGHT) \ + do { \ + const __m128i v0 = \ + _mm_loadl_epi64(reinterpret_cast(org + (L) * stride)); \ + const __m128i v1 = \ + _mm_loadl_epi64(reinterpret_cast(rec + (L) * stride)); \ + const __m128i w0 = _mm_unpacklo_epi8(v0, zero); \ + const __m128i w1 = _mm_unpacklo_epi8(v1, zero); \ + const __m128i ww0 = _mm_mullo_epi16(w0, (WEIGHT).values_.m_); \ + const __m128i ww1 = _mm_mullo_epi16(w1, (WEIGHT).values_.m_); \ + x = _mm_add_epi32(x, _mm_unpacklo_epi16(ww0, zero)); \ + y = _mm_add_epi32(y, _mm_unpacklo_epi16(ww1, zero)); \ + x = _mm_add_epi32(x, _mm_unpackhi_epi16(ww0, zero)); \ + y = _mm_add_epi32(y, _mm_unpackhi_epi16(ww1, zero)); \ + xx = _mm_add_epi32(xx, _mm_madd_epi16(ww0, w0)); \ + xy = _mm_add_epi32(xy, _mm_madd_epi16(ww0, w1)); \ + yy = _mm_add_epi32(yy, _mm_madd_epi16(ww1, w1)); \ } while (0) #define ADD_AND_STORE_FOUR_EPI32(M, OUT) \