From cda55fcf5321dc3f5dcce7a364a588794f78df13 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 15 May 2026 13:52:46 -0700 Subject: [PATCH] Mirrow AVX2 functions for Visual C Bug: libyuv:42280902 Change-Id: Iabbec9af3a4f4dd89294e60145823c7fc4dd6ec6 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7843378 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- GEMINI.md | 8 +- README.chromium | 2 +- docs/getting_started.md | 1 - include/libyuv/row.h | 28 ++-- include/libyuv/version.h | 2 +- source/planar_functions.cc | 11 +- source/rotate.cc | 6 +- source/row_any.cc | 13 +- source/row_common.cc | 8 -- source/row_gcc.cc | 280 +++++++++++++++++++++++-------------- source/row_win.cc | 122 ++++++++++++++++ unit_test/color_test.cc | 6 - unit_test/convert_test.cc | 4 - unit_test/planar_test.cc | 12 +- 14 files changed, 332 insertions(+), 171 deletions(-) diff --git a/GEMINI.md b/GEMINI.md index 1860b2e95..3bda686fd 100644 --- a/GEMINI.md +++ b/GEMINI.md @@ -10,7 +10,7 @@ Libyuv uses a dispatch system where high-level conversion functions call optimiz ### x86 Architectures (32-bit and 64-bit) -* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports SSE, AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets. +* **row_gcc.cc**: **Master copy.** Contains inline assembly in GCC syntax for GCC and Clang. Supports AVX, and AVX512. AVX512 implementations are strictly for 64-bit targets. * **row_win.cc**: Derivative of `row_gcc.cc`. Contains C++ intrinsics specifically for Visual C++ (MSVC). Can be tested with Clang using `-DLIBYUV_ENABLE_ROWWIN`. * **Note**: Use either `row_gcc` or `row_win`, never both. @@ -33,10 +33,8 @@ Libyuv uses a dispatch system where high-level conversion functions call optimiz ## Coding Guidelines -1. **Maintain Parity**: When modifying `row_common.cc`, update corresponding optimizations. `row_gcc.cc` is the master; `row_win.cc` should be updated to match. -2. **Assembly Safety**: Ensure inline assembly in `row_gcc.cc` correctly preserves registers according to the ABI. -3. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**. -4. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions. +1. **AVX512 Logic**: AVX512 row functions are strictly enabled for **64-bit x86 only**. +2. **Feature Macros**: Use the `HAS_` macros in `include/libyuv/row.h` to enable or disable specific AVX512 versions. ## Changelist (CL) & Commit Guidelines diff --git a/README.chromium b/README.chromium index 3e36b6704..c0f7290d9 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1938 +Version: 1939 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/docs/getting_started.md b/docs/getting_started.md index 6f5593576..06160bb20 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -72,7 +72,6 @@ Additional commonly used compiler options can be passed to Bazel via `--copt`: bazel build -c opt --config=android_arm64 \ --copt=-DLIBYUV_UNLIMITED_DATA \ - --copt=-DLIBYUV_BIT_EXACT=1 \ --copt=-DENABLE_ROW_TESTS \ //:libyuv_test diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 2bd046913..67e629aae 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -142,8 +142,13 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || defined(__i386__) || \ defined(_M_X64) || defined(_M_X86)) +#define HAS_ARGBMIRRORROW_AVX2 +#define HAS_RGB24MIRRORROW_AVX2 #define HAS_ARGBTOUVMATRIXROW_AVX2 #define HAS_MERGEUVROW_AVX2 +#define HAS_MIRRORROW_AVX2 +#define HAS_MIRRORSPLITUVROW_AVX2 +#define HAS_MIRRORUVROW_AVX2 #endif #if !defined(LIBYUV_DISABLE_X86) && defined(USE_ROW_GCC) && \ @@ -170,6 +175,7 @@ extern "C" { #define HAS_INTERPOLATEROW_AVX2 #define HAS_J422TOARGBROW_AVX2 #define HAS_MIRRORROW_AVX2 +#define HAS_MIRRORSPLITUVROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB24ROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 @@ -236,7 +242,6 @@ extern "C" { #define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTOARGBROW_AVX2 #define HAS_RAWTORGBAROW_SSSE3 -#define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITARGBROW_SSE2 #define HAS_SPLITARGBROW_SSSE3 @@ -2939,15 +2944,14 @@ void MirrorUVRow_LSX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_LASX(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width); void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LSX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void MirrorUVRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width); +void MirrorSplitUVRow_AVX2(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width); void MirrorSplitUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -2983,16 +2987,16 @@ void ARGBMirrorRow_Any_LASX(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, - uint8_t* dst_rgb24, - int width); +void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width); void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width); -void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, - uint8_t* dst_ptr, - int width); +void RGB24MirrorRow_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1a7808bc2..97aa52c6a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1938 +#define LIBYUV_VERSION 1939 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 7c78277e6..e65352163 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2583,7 +2583,6 @@ void MirrorUVPlane(const uint8_t* src_uv, #endif #if defined(HAS_MIRRORUVROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - MirrorUVRow = MirrorUVRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { MirrorUVRow = MirrorUVRow_SSSE3; } @@ -2823,11 +2822,11 @@ int RGB24Mirror(const uint8_t* src_rgb24, } } #endif -#if defined(HAS_RGB24MIRRORROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - RGB24MirrorRow = RGB24MirrorRow_SSSE3; +#if defined(HAS_RGB24MIRRORROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + RGB24MirrorRow = RGB24MirrorRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + RGB24MirrorRow = RGB24MirrorRow_AVX2; } } #endif diff --git a/source/rotate.cc b/source/rotate.cc index d4a9fcd27..520806236 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -397,9 +397,9 @@ void SplitRotateUV180(const uint8_t* src, MirrorSplitUVRow = MirrorSplitUVRow_NEON; } #endif -#if defined(HAS_MIRRORSPLITUVROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) { - MirrorSplitUVRow = MirrorSplitUVRow_SSSE3; +#if defined(HAS_MIRRORSPLITUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) { + MirrorSplitUVRow = MirrorSplitUVRow_AVX2; } #endif #if defined(HAS_MIRRORSPLITUVROW_LSX) diff --git a/source/row_any.cc b/source/row_any.cc index 4ae858560..86991ce7d 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1906,8 +1906,8 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width) { \ - SIMD_ALIGNED(uint8_t vin[64]); \ - SIMD_ALIGNED(uint8_t vout[64]); \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ memset(vin, 0, sizeof(vin)); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ @@ -1915,7 +1915,7 @@ ANY11IS(InterpolateRow_16To8_Any_AVX2, ANY_SIMD(src_ptr + r * BPP, dst_ptr, n); \ } \ ptrdiff_t np = n; \ - memcpy(vin, src_ptr, r* BPP); \ + memcpy(vin, src_ptr, r * BPP); \ ANY_SIMD(vin, vout, MASK + 1); \ memcpy(dst_ptr + np * BPP, vout + (MASK + 1 - r) * BPP, r * BPP); \ } @@ -1938,9 +1938,6 @@ ANY11M(MirrorRow_Any_LASX, MirrorRow_LASX, 1, 63) #ifdef HAS_MIRRORUVROW_AVX2 ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15) #endif -#ifdef HAS_MIRRORUVROW_SSSE3 -ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7) -#endif #ifdef HAS_MIRRORUVROW_NEON ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31) #endif @@ -1965,8 +1962,8 @@ ANY11M(ARGBMirrorRow_Any_LSX, ARGBMirrorRow_LSX, 4, 7) #ifdef HAS_ARGBMIRRORROW_LASX ANY11M(ARGBMirrorRow_Any_LASX, ARGBMirrorRow_LASX, 4, 15) #endif -#ifdef HAS_RGB24MIRRORROW_SSSE3 -ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) +#ifdef HAS_RGB24MIRRORROW_AVX2 +ANY11M(RGB24MirrorRow_Any_AVX2, RGB24MirrorRow_AVX2, 3, 31) #endif #ifdef HAS_RGB24MIRRORROW_NEON ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) diff --git a/source/row_common.cc b/source/row_common.cc index 50795cf91..67dc13019 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -37,10 +37,6 @@ extern "C" { // LIBYUV_UNLIMITED_BT709 // LIBYUV_UNLIMITED_BT2020 -#if defined(LIBYUV_BIT_EXACT) -#define LIBYUV_UNATTENUATE_DUP 1 -#endif - // llvm x86 is poor at ternary operator, so use branchless min/max. #define USE_BRANCHLESS 1 @@ -3578,12 +3574,8 @@ const uint32_t fixed_invtbl8[256] = { T(0xfc), T(0xfd), T(0xfe), 0x01000100}; #undef T -#if defined(LIBYUV_UNATTENUATE_DUP) // This code mimics the Intel SIMD version for better testability. #define UNATTENUATE(f, ia) clamp255(((f | (f << 8)) * ia) >> 16) -#else -#define UNATTENUATE(f, ia) clamp255((f * ia) >> 8) -#endif // mimics the Intel SIMD code for exactness. void ARGBUnattenuateRow_C(const uint8_t* src_argb, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index c4c3107f3..22b8d0b30 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -21,6 +21,10 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && \ !defined(LIBYUV_ENABLE_ROWWIN) +// Note: for avx and avx512 declare clobber as xmm registers due to +// clang for windows needing to preserve xmm registers but not saving +// them if declared as ymm or zmm. + #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3) // Constants for ARGB @@ -268,11 +272,11 @@ static const uint32_t kPermdRAWToARGB_AVX512BW[16] = { void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint32_t* shuffler, int width) { asm volatile( - "vpternlogd $0xff,%%zmm22,%%zmm22,%%zmm22 \n" // 0xffffffff - "vpslld $0x18,%%zmm22,%%zmm22 \n" // 0xff000000 + "vpternlogd $0xff,%%zmm6,%%zmm6,%%zmm6 \n" // 0xffffffff + "vpslld $0x18,%%zmm6,%%zmm6 \n" // 0xff000000 "movabs $0xffffffffffff,%%rax \n" // 48 bytes mask "kmovq %%rax,%%k1 \n" - "vmovdqu32 %3,%%zmm21 \n" + "vmovdqu32 %3,%%zmm5 \n" "vbroadcasti32x4 %4,%%zmm4 \n" LABELALIGN // @@ -282,18 +286,18 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint "vmovdqu8 96(%0),%%zmm2%{%%k1%}%{z%} \n" "vmovdqu8 144(%0),%%zmm3%{%%k1%}%{z%} \n" "lea 192(%0),%0 \n" - "vpermd %%zmm0,%%zmm21,%%zmm0 \n" - "vpermd %%zmm1,%%zmm21,%%zmm1 \n" - "vpermd %%zmm2,%%zmm21,%%zmm2 \n" - "vpermd %%zmm3,%%zmm21,%%zmm3 \n" + "vpermd %%zmm0,%%zmm5,%%zmm0 \n" + "vpermd %%zmm1,%%zmm5,%%zmm1 \n" + "vpermd %%zmm2,%%zmm5,%%zmm2 \n" + "vpermd %%zmm3,%%zmm5,%%zmm3 \n" "vpshufb %%zmm4,%%zmm0,%%zmm0 \n" "vpshufb %%zmm4,%%zmm1,%%zmm1 \n" "vpshufb %%zmm4,%%zmm2,%%zmm2 \n" "vpshufb %%zmm4,%%zmm3,%%zmm3 \n" - "vpord %%zmm22,%%zmm0,%%zmm0 \n" - "vpord %%zmm22,%%zmm1,%%zmm1 \n" - "vpord %%zmm22,%%zmm2,%%zmm2 \n" - "vpord %%zmm22,%%zmm3,%%zmm3 \n" + "vpord %%zmm6,%%zmm0,%%zmm0 \n" + "vpord %%zmm6,%%zmm1,%%zmm1 \n" + "vpord %%zmm6,%%zmm2,%%zmm2 \n" + "vpord %%zmm6,%%zmm3,%%zmm3 \n" "vmovdqu32 %%zmm0,(%1) \n" "vmovdqu32 %%zmm1,0x40(%1) \n" "vmovdqu32 %%zmm2,0x80(%1) \n" @@ -307,7 +311,7 @@ void RGBToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, const uint "+r"(width) // %2 : "m"(kPermdRAWToARGB_AVX512BW), // %3 "m"(*shuffler) // %4 - : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm21", "xmm22"); + : "memory", "cc", "rax", "k1", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } void RAWToARGBRow_AVX512BW(const uint8_t* src_raw, uint8_t* dst_argb, int width) { @@ -1568,26 +1572,26 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, const struct ArgbConstants* c) { asm volatile( "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" - "vpsllw $15,%%zmm16,%%zmm21 \n" - "vpacksswb %%zmm21,%%zmm21,%%zmm21 \n" + "vpsllw $15,%%zmm16,%%zmm5 \n" + "vpacksswb %%zmm5,%%zmm5,%%zmm5 \n" "vpsrlw $15,%%zmm16,%%zmm16 \n" // zmm16 = 1 "vbroadcasti64x4 0(%3),%%zmm4 \n" - "vbroadcasti64x4 0x60(%3),%%zmm23 \n" - "vpmaddubsw %%zmm21,%%zmm4,%%zmm22 \n" - "vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n" - "vpackssdw %%zmm22,%%zmm22,%%zmm22 \n" - "vpsubw %%zmm22,%%zmm23,%%zmm23 \n" - "vmovups %4,%%zmm22 \n" + "vbroadcasti64x4 0x60(%3),%%zmm7 \n" + "vpmaddubsw %%zmm5,%%zmm4,%%zmm6 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" + "vpackssdw %%zmm6,%%zmm6,%%zmm6 \n" + "vpsubw %%zmm6,%%zmm7,%%zmm7 \n" + "vmovups %4,%%zmm6 \n" LABELALIGN "1: \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" "vmovups 0x80(%0),%%zmm2 \n" "vmovups 0xc0(%0),%%zmm3 \n" - "vpsubb %%zmm21,%%zmm0,%%zmm0 \n" - "vpsubb %%zmm21,%%zmm1,%%zmm1 \n" - "vpsubb %%zmm21,%%zmm2,%%zmm2 \n" - "vpsubb %%zmm21,%%zmm3,%%zmm3 \n" + "vpsubb %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubb %%zmm5,%%zmm1,%%zmm1 \n" + "vpsubb %%zmm5,%%zmm2,%%zmm2 \n" + "vpsubb %%zmm5,%%zmm3,%%zmm3 \n" "vpmaddubsw %%zmm0,%%zmm4,%%zmm0 \n" "vpmaddubsw %%zmm1,%%zmm4,%%zmm1 \n" "vpmaddubsw %%zmm2,%%zmm4,%%zmm2 \n" @@ -1599,12 +1603,12 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" "vpmaddwd %%zmm16,%%zmm3,%%zmm3 \n" "vpackssdw %%zmm3,%%zmm2,%%zmm2 \n" - "vpaddw %%zmm23,%%zmm0,%%zmm0 \n" - "vpaddw %%zmm23,%%zmm2,%%zmm2 \n" + "vpaddw %%zmm7,%%zmm0,%%zmm0 \n" + "vpaddw %%zmm7,%%zmm2,%%zmm2 \n" "vpsrlw $0x8,%%zmm0,%%zmm0 \n" "vpsrlw $0x8,%%zmm2,%%zmm2 \n" "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" - "vpermd %%zmm0,%%zmm22,%%zmm0 \n" + "vpermd %%zmm0,%%zmm6,%%zmm0 \n" "vmovups %%zmm0,(%1) \n" "lea 0x40(%1),%1 \n" "sub $0x40,%2 \n" @@ -1615,8 +1619,8 @@ void ARGBToYMatrixRow_AVX512BW(const uint8_t* src_argb, "+r"(width) // %2 : "r"(c), // %3 "m"(kPermdARGBToY_AVX512BW) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm21", - "xmm22", "xmm23"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm16"); } #endif @@ -1773,8 +1777,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, "vbroadcasti64x4 0x20(%4),%%zmm3 \n" // kRGBToU "vbroadcasti64x4 0x40(%4),%%zmm4 \n" // kRGBToV "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" // -1 - "vpsllw $15,%%zmm16,%%zmm21 \n" // 0x8000 - "vmovups %5,%%zmm23 \n" + "vpsllw $15,%%zmm16,%%zmm5 \n" // 0x8000 + "vmovups %5,%%zmm7 \n" "sub %1,%2 \n" LABELALIGN @@ -1782,45 +1786,45 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" "vmovups 0x80(%0),%%zmm2 \n" - "vmovups 0xc0(%0),%%zmm22 \n" + "vmovups 0xc0(%0),%%zmm6 \n" "vpmaddubsw %%zmm3,%%zmm0,%%zmm0 \n" "vpmaddubsw %%zmm3,%%zmm1,%%zmm1 \n" "vpmaddubsw %%zmm3,%%zmm2,%%zmm2 \n" - "vpmaddubsw %%zmm3,%%zmm22,%%zmm22 \n" + "vpmaddubsw %%zmm3,%%zmm6,%%zmm6 \n" "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" - "vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" "vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates - "vpackssdw %%zmm22,%%zmm2,%%zmm2 \n" - "vpsubw %%zmm21,%%zmm0,%%zmm0 \n" - "vpsubw %%zmm21,%%zmm2,%%zmm2 \n" + "vpackssdw %%zmm6,%%zmm2,%%zmm2 \n" + "vpsubw %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubw %%zmm5,%%zmm2,%%zmm2 \n" "vpsrlw $0x8,%%zmm0,%%zmm0 \n" "vpsrlw $0x8,%%zmm2,%%zmm2 \n" "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates - "vpermd %%zmm0,%%zmm23,%%zmm0 \n" // unmutate. + "vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate. "vmovups %%zmm0,(%1) \n" "vmovups (%0),%%zmm0 \n" "vmovups 0x40(%0),%%zmm1 \n" "vmovups 0x80(%0),%%zmm2 \n" - "vmovups 0xc0(%0),%%zmm22 \n" + "vmovups 0xc0(%0),%%zmm6 \n" "vpmaddubsw %%zmm4,%%zmm0,%%zmm0 \n" "vpmaddubsw %%zmm4,%%zmm1,%%zmm1 \n" "vpmaddubsw %%zmm4,%%zmm2,%%zmm2 \n" - "vpmaddubsw %%zmm4,%%zmm22,%%zmm22 \n" + "vpmaddubsw %%zmm4,%%zmm6,%%zmm6 \n" "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" "vpmaddwd %%zmm16,%%zmm2,%%zmm2 \n" - "vpmaddwd %%zmm16,%%zmm22,%%zmm22 \n" + "vpmaddwd %%zmm16,%%zmm6,%%zmm6 \n" "vpackssdw %%zmm1,%%zmm0,%%zmm0 \n" // mutates - "vpackssdw %%zmm22,%%zmm2,%%zmm2 \n" - "vpsubw %%zmm21,%%zmm0,%%zmm0 \n" - "vpsubw %%zmm21,%%zmm2,%%zmm2 \n" + "vpackssdw %%zmm6,%%zmm2,%%zmm2 \n" + "vpsubw %%zmm5,%%zmm0,%%zmm0 \n" + "vpsubw %%zmm5,%%zmm2,%%zmm2 \n" "vpsrlw $0x8,%%zmm0,%%zmm0 \n" "vpsrlw $0x8,%%zmm2,%%zmm2 \n" "vpackuswb %%zmm2,%%zmm0,%%zmm0 \n" // mutates - "vpermd %%zmm0,%%zmm23,%%zmm0 \n" // unmutate. + "vpermd %%zmm0,%%zmm7,%%zmm0 \n" // unmutate. "vmovups %%zmm0,(%1,%2,1) \n" "lea 0x100(%0),%0 \n" "lea 0x40(%1),%1 \n" @@ -1837,8 +1841,8 @@ void ARGBToUV444MatrixRow_AVX512BW(const uint8_t* src_argb, #endif : "r"(c), // %4 "m"(kPermdARGBToY_AVX512BW) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm21", - "xmm22", "xmm23"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm16"); } #endif // HAS_ARGBTOUV444ROW_AVX512BW @@ -2233,11 +2237,11 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, const struct ArgbConstants* c) { asm volatile( "vbroadcasti64x4 0x20(%5),%%zmm4 \n" // RGBToU - "vbroadcasti64x4 0x40(%5),%%zmm21 \n" // RGBToV + "vbroadcasti64x4 0x40(%5),%%zmm5 \n" // RGBToV "vpternlogd $0xff,%%zmm16,%%zmm16,%%zmm16 \n" - "vpabsb %%zmm16,%%zmm22 \n" // 0x0101 + "vpabsb %%zmm16,%%zmm6 \n" // 0x0101 "vpsllw $15,%%zmm16,%%zmm17 \n" // 0x8000 - "vbroadcasti64x4 %6,%%zmm23 \n" // kShuffleAARRGGBB + "vbroadcasti64x4 %6,%%zmm7 \n" // kShuffleAARRGGBB "vmovups %7,%%zmm18 \n" // kPermdARGBToY_AVX512BW "vmovups %8,%%zmm19 \n" // kPermdARGBToUV_AVX512BW "sub %1,%2 \n" @@ -2248,14 +2252,14 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, "vmovups 0x40(%0),%%zmm1 \n" "vmovups 0x00(%0,%4,1),%%zmm2 \n" "vmovups 0x40(%0,%4,1),%%zmm3 \n" - "vpshufb %%zmm23,%%zmm0,%%zmm0 \n" // aarrggbb - "vpshufb %%zmm23,%%zmm1,%%zmm1 \n" - "vpshufb %%zmm23,%%zmm2,%%zmm2 \n" - "vpshufb %%zmm23,%%zmm3,%%zmm3 \n" - "vpmaddubsw %%zmm22,%%zmm0,%%zmm0 \n" // 32x2 -> 16x2 - "vpmaddubsw %%zmm22,%%zmm1,%%zmm1 \n" - "vpmaddubsw %%zmm22,%%zmm2,%%zmm2 \n" - "vpmaddubsw %%zmm22,%%zmm3,%%zmm3 \n" + "vpshufb %%zmm7,%%zmm0,%%zmm0 \n" // aarrggbb + "vpshufb %%zmm7,%%zmm1,%%zmm1 \n" + "vpshufb %%zmm7,%%zmm2,%%zmm2 \n" + "vpshufb %%zmm7,%%zmm3,%%zmm3 \n" + "vpmaddubsw %%zmm6,%%zmm0,%%zmm0 \n" // 32x2 -> 16x2 + "vpmaddubsw %%zmm6,%%zmm1,%%zmm1 \n" + "vpmaddubsw %%zmm6,%%zmm2,%%zmm2 \n" + "vpmaddubsw %%zmm6,%%zmm3,%%zmm3 \n" "vpaddw %%zmm0,%%zmm2,%%zmm0 \n" // 16x2 -> 16x1 "vpaddw %%zmm1,%%zmm3,%%zmm1 \n" "vpxorq %%zmm2,%%zmm2,%%zmm2 \n" // 0 for vpavgw @@ -2267,7 +2271,7 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, "vpermd %%zmm0,%%zmm19,%%zmm0 \n" // unscramble pixels "vpmaddubsw %%zmm4,%%zmm0,%%zmm1 \n" // 16 U - "vpmaddubsw %%zmm21,%%zmm0,%%zmm0 \n" // 16 V + "vpmaddubsw %%zmm5,%%zmm0,%%zmm0 \n" // 16 V "vpmaddwd %%zmm16,%%zmm1,%%zmm1 \n" "vpmaddwd %%zmm16,%%zmm0,%%zmm0 \n" "vpackssdw %%zmm0,%%zmm1,%%zmm0 \n" // mutates (U in lower, V in upper) @@ -2298,8 +2302,8 @@ void ARGBToUVMatrixRow_AVX512BW(const uint8_t* src_argb, "m"(kShuffleAARRGGBB), // %6 "m"(kPermdARGBToY_AVX512BW), // %7 "m"(kPermdARGBToUV_AVX512BW) // %8 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm16", "xmm17", - "xmm18", "xmm19", "xmm21", "xmm22", "xmm23"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7", "xmm16", "xmm17", "xmm18", "xmm19"); } void ARGBToUVRow_AVX512BW(const uint8_t* src_argb, @@ -3626,14 +3630,14 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vmovdqa 128(%[yuvconstants]),%%ymm12 \n" #define YUVTORGB_SETUP_AVX512BW(yuvconstants) \ - "vpternlogd $0xff,%%zmm29,%%zmm29,%%zmm29 \n" \ - "vpbroadcastq (%[yuvconstants]),%%zmm24 \n" \ - "vpabsb %%zmm29,%%zmm29 \n" \ - "vpsllw $7,%%zmm29,%%zmm29 \n" \ - "vpbroadcastq 32(%[yuvconstants]),%%zmm25 \n" \ - "vpbroadcastq 64(%[yuvconstants]),%%zmm26 \n" \ - "vpbroadcastq 96(%[yuvconstants]),%%zmm27 \n" \ - "vpbroadcastq 128(%[yuvconstants]),%%zmm28 \n" \ + "vpternlogd $0xff,%%zmm13,%%zmm13,%%zmm13 \n" \ + "vpbroadcastq (%[yuvconstants]),%%zmm8 \n" \ + "vpabsb %%zmm13,%%zmm13 \n" \ + "vpsllw $7,%%zmm13,%%zmm13 \n" \ + "vpbroadcastq 32(%[yuvconstants]),%%zmm9 \n" \ + "vpbroadcastq 64(%[yuvconstants]),%%zmm10 \n" \ + "vpbroadcastq 96(%[yuvconstants]),%%zmm11 \n" \ + "vpbroadcastq 128(%[yuvconstants]),%%zmm12 \n" \ "vmovups (%[quadsplitperm]),%%zmm16 \n" \ "vmovups (%[dquadsplitperm]),%%zmm17 \n" \ "vmovups (%[unperm]),%%zmm18 \n" @@ -3650,12 +3654,12 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpaddsw %%ymm4,%%ymm2,%%ymm2 \n" #define YUVTORGB16_AVX512BW(yuvconstants) \ - "vpsubb %%zmm29,%%zmm3,%%zmm3 \n" \ - "vpmulhuw %%zmm27,%%zmm4,%%zmm4 \n" \ - "vpmaddubsw %%zmm3,%%zmm24,%%zmm0 \n" \ - "vpmaddubsw %%zmm3,%%zmm25,%%zmm1 \n" \ - "vpmaddubsw %%zmm3,%%zmm26,%%zmm2 \n" \ - "vpaddw %%zmm4,%%zmm28,%%zmm4 \n" \ + "vpsubb %%zmm13,%%zmm3,%%zmm3 \n" \ + "vpmulhuw %%zmm11,%%zmm4,%%zmm4 \n" \ + "vpmaddubsw %%zmm3,%%zmm8,%%zmm0 \n" \ + "vpmaddubsw %%zmm3,%%zmm9,%%zmm1 \n" \ + "vpmaddubsw %%zmm3,%%zmm10,%%zmm2 \n" \ + "vpaddw %%zmm4,%%zmm12,%%zmm4 \n" \ "vpaddsw %%zmm4,%%zmm0,%%zmm0 \n" \ "vpsubsw %%zmm1,%%zmm4,%%zmm1 \n" \ "vpaddsw %%zmm4,%%zmm2,%%zmm2 \n" @@ -3722,7 +3726,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, #define STOREARGB_AVX512BW \ "vpunpcklbw %%zmm1,%%zmm0,%%zmm0 \n" \ "vpermq %%zmm0,%%zmm18,%%zmm0 \n" \ - "vpunpcklbw %%zmm21,%%zmm2,%%zmm2 \n" \ + "vpunpcklbw %%zmm5,%%zmm2,%%zmm2 \n" \ "vpermq %%zmm2,%%zmm18,%%zmm2 \n" \ "vpunpcklwd %%zmm2,%%zmm0,%%zmm1 \n" \ "vpunpckhwd %%zmm2,%%zmm0,%%zmm0 \n" \ @@ -3844,7 +3848,7 @@ void OMITFP I422ToARGBRow_AVX512BW(const uint8_t* y_buf, YUVTORGB_SETUP_AVX512BW(yuvconstants) "sub %[u_buf],%[v_buf] \n" "vpcmpeqb %%xmm5,%%xmm5,%%xmm5 \n" - "vpbroadcastq %%xmm5,%%zmm21 \n" + "vpbroadcastq %%xmm5,%%zmm5 \n" LABELALIGN "1: \n" @@ -4685,6 +4689,43 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif // HAS_MIRRORROW_AVX2 +#ifdef HAS_MIRRORSPLITUVROW_AVX2 +// Shuffle table for reversing the bytes of UV channels. +static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, + 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; + +void MirrorSplitUVRow_AVX2(const uint8_t* src, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + ptrdiff_t temp_width = (ptrdiff_t)(width); + asm volatile( + "vbroadcasti128 %4,%%ymm1 \n" + "lea -0x20(%0,%3,2),%0 \n" + "sub %1,%2 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "lea -0x20(%0),%0 \n" + "vpshufb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0x72,%%ymm0,%%ymm0 \n" + "vextracti128 $0x1,%%ymm0,%%xmm2 \n" + "vmovdqu %%xmm0,(%1) \n" + "vmovdqu %%xmm2,0x00(%1,%2,1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorSplitUV) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif // HAS_MIRRORSPLITUVROW_AVX2 + #ifdef HAS_MIRRORUVROW_SSSE3 // Shuffle table for reversing the UV. static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u, @@ -4733,38 +4774,7 @@ void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { } #endif // HAS_MIRRORUVROW_AVX2 -#ifdef HAS_MIRRORSPLITUVROW_SSSE3 -// Shuffle table for reversing the bytes of UV channels. -static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, - 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u}; -void MirrorSplitUVRow_SSSE3(const uint8_t* src, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { - ptrdiff_t temp_width = (ptrdiff_t)(width); - asm volatile( - "movdqa %4,%%xmm1 \n" - "lea -0x10(%0,%3,2),%0 \n" - "sub %1,%2 \n" - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "lea -0x10(%0),%0 \n" - "pshufb %%xmm1,%%xmm0 \n" - "movlpd %%xmm0,(%1) \n" - "movhpd %%xmm0,0x00(%1,%2,1) \n" - "lea 0x8(%1),%1 \n" - "sub $8,%3 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(temp_width) // %3 - : "m"(kShuffleMirrorSplitUV) // %4 - : "memory", "cc", "xmm0", "xmm1"); -} -#endif // HAS_MIRRORSPLITUVROW_SSSE3 #ifdef HAS_RGB24MIRRORROW_SSSE3 @@ -4813,6 +4823,60 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, } #endif // HAS_RGB24MIRRORROW_SSSE3 +#ifdef HAS_RGB24MIRRORROW_AVX2 +// Shuffle first 10 pixels to last 10 mirrored. first byte zero +static const uvec8 kShuffleMirrorRGB0_AVX = { + 128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u +}; + +// Shuffle last 2 pixels to first 2 mirrored. last byte zero +static const uvec8 kShuffleMirrorRGB1_AVX = { + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u +}; + +void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + ptrdiff_t temp_width = (ptrdiff_t)(width); + src_rgb24 += width * 3 - 96; + asm volatile( + "vbroadcasti128 %3,%%ymm4 \n" + "vmovdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" // first 10 + "vinserti128 $1,15(%0),%%ymm0,%%ymm0 \n" + "vmovdqu 30(%0),%%xmm1 \n" // next 10 + "vinserti128 $1,45(%0),%%ymm1,%%ymm1 \n" + "vmovdqu 60(%0),%%xmm2 \n" // next 10 + "vinserti128 $1,75(%0),%%ymm2,%%ymm2 \n" + "vmovdqu 80(%0),%%xmm3 \n" // last 2 special + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm2,%%ymm2 \n" + "vpshufb %%xmm5,%%xmm3,%%xmm3 \n" + "lea -0x60(%0),%0 \n" + "vmovdqu %%xmm0,80(%1) \n" + "vextracti128 $1,%%ymm0,65(%1) \n" + "vmovdqu %%xmm1,50(%1) \n" + "vextracti128 $1,%%ymm1,35(%1) \n" + "vmovdqu %%xmm2,20(%1) \n" + "vextracti128 $1,%%ymm2,5(%1) \n" + "vmovq %%xmm3,0(%1) \n" + "lea 0x60(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0_AVX), // %3 + "m"(kShuffleMirrorRGB1_AVX) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_RGB24MIRRORROW_AVX2 + #ifdef HAS_ARGBMIRRORROW_SSE2 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { diff --git a/source/row_win.cc b/source/row_win.cc index 87a4a5aeb..1d92043be 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -470,6 +470,128 @@ void MergeUVRow_AVX2(const uint8_t* src_u, } #endif +#ifdef HAS_MIRRORROW_AVX2 +LIBYUV_TARGET_AVX2 +void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + __m256i ymm_shuf = + _mm256_broadcastsi128_si256(_mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); + src += width; + while (width > 0) { + src -= 32; + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src); + ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); + ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e); + _mm256_storeu_si256((__m256i*)dst, ymm0); + dst += 32; + width -= 32; + } +} +#endif + +#ifdef HAS_MIRRORUVROW_AVX2 +LIBYUV_TARGET_AVX2 +void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) { + __m256i ymm_shuf = + _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1)); + src_uv += width * 2; + while (width > 0) { + src_uv -= 32; + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv); + ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); + ymm0 = _mm256_permute4x64_epi64(ymm0, 0x4e); + _mm256_storeu_si256((__m256i*)dst_uv, ymm0); + dst_uv += 32; + width -= 16; + } +} +#endif + +#ifdef HAS_MIRRORSPLITUVROW_AVX2 +LIBYUV_TARGET_AVX2 +void MirrorSplitUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { + __m256i ymm_shuf = + _mm256_broadcastsi128_si256(_mm_setr_epi8(14, 12, 10, 8, 6, 4, 2, 0, 15, 13, 11, 9, 7, 5, 3, 1)); + src_uv += width * 2; + while (width > 0) { + src_uv -= 32; + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_uv); + ymm0 = _mm256_shuffle_epi8(ymm0, ymm_shuf); + ymm0 = _mm256_permute4x64_epi64(ymm0, 0x72); + _mm_storeu_si128((__m128i*)dst_u, _mm256_castsi256_si128(ymm0)); + _mm_storeu_si128((__m128i*)dst_v, _mm256_extracti128_si256(ymm0, 1)); + dst_u += 16; + dst_v += 16; + width -= 16; + } +} +#endif + +#ifdef HAS_RGB24MIRRORROW_AVX2 +LIBYUV_TARGET_AVX2 +void RGB24MirrorRow_AVX2(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, + int width) { + __m256i shuf0 = _mm256_setr_epi8( + -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2, + -1, 12, 13, 14, 9, 10, 11, 6, 7, 8, 3, 4, 5, 0, 1, 2); + __m128i shuf1 = _mm_setr_epi8( + 13, 14, 15, 10, 11, 12, 7, 8, 9, 4, 5, 6, 1, 2, 3, -1); + + src_rgb24 += width * 3 - 96; + while (width > 0) { + __m128i v0_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 0)); + __m128i v0_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 15)); + __m256i v0 = _mm256_inserti128_si256(_mm256_castsi128_si256(v0_lo), v0_hi, 1); + + __m128i v1_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 30)); + __m128i v1_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 45)); + __m256i v1 = _mm256_inserti128_si256(_mm256_castsi128_si256(v1_lo), v1_hi, 1); + + __m128i v2_lo = _mm_loadu_si128((const __m128i*)(src_rgb24 + 60)); + __m128i v2_hi = _mm_loadu_si128((const __m128i*)(src_rgb24 + 75)); + __m256i v2 = _mm256_inserti128_si256(_mm256_castsi128_si256(v2_lo), v2_hi, 1); + + __m128i v3 = _mm_loadu_si128((const __m128i*)(src_rgb24 + 80)); + + v0 = _mm256_shuffle_epi8(v0, shuf0); + v1 = _mm256_shuffle_epi8(v1, shuf0); + v2 = _mm256_shuffle_epi8(v2, shuf0); + v3 = _mm_shuffle_epi8(v3, shuf1); + + _mm_storeu_si128((__m128i*)(dst_rgb24 + 80), _mm256_castsi256_si128(v0)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 65), _mm256_extracti128_si256(v0, 1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 50), _mm256_castsi256_si128(v1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 35), _mm256_extracti128_si256(v1, 1)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 20), _mm256_castsi256_si128(v2)); + _mm_storeu_si128((__m128i*)(dst_rgb24 + 5), _mm256_extracti128_si256(v2, 1)); + _mm_storel_epi64((__m128i*)(dst_rgb24 + 0), v3); + + src_rgb24 -= 96; + dst_rgb24 += 96; + width -= 32; + } +} +#endif + +#ifdef HAS_ARGBMIRRORROW_AVX2 +LIBYUV_TARGET_AVX2 +void ARGBMirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) { + __m256i ymm_shuf = _mm256_setr_epi32(7, 6, 5, 4, 3, 2, 1, 0); + src += width * 4; + while (width > 0) { + src -= 32; + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src); + ymm0 = _mm256_permutevar8x32_epi32(ymm0, ymm_shuf); + _mm256_storeu_si256((__m256i*)dst, ymm0); + dst += 32; + width -= 8; + } +} +#endif + #endif #ifdef __cplusplus diff --git a/unit_test/color_test.cc b/unit_test/color_test.cc index 9ed0b8344..3048f728a 100644 --- a/unit_test/color_test.cc +++ b/unit_test/color_test.cc @@ -22,14 +22,8 @@ namespace libyuv { // TODO(fbarchard): clang x86 has a higher accuracy YUV to RGB. // Port to Visual C and other CPUs -#if !defined(LIBYUV_BIT_EXACT) && !defined(LIBYUV_DISABLE_X86) && \ - (defined(__x86_64__) || defined(__i386__)) -#define ERROR_FULL 5 -#define ERROR_J420 4 -#else #define ERROR_FULL 6 #define ERROR_J420 6 -#endif #define ERROR_R 1 #define ERROR_G 1 #ifdef LIBYUV_UNLIMITED_DATA diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 481fe1272..9b7cc85d9 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -2050,7 +2050,6 @@ TEST_F(LibYUVConvertTest, MM21ToYUY2) { } // Test RGB24 to J420 is exact -#if defined(LIBYUV_BIT_EXACT) TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 @@ -2075,10 +2074,8 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) { free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_j420); } -#endif // Test RGB24 to I420 is exact -#if defined(LIBYUV_BIT_EXACT) TEST_F(LibYUVConvertTest, TestRGB24ToI420) { const int kSize = 256; align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24 @@ -2103,7 +2100,6 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) { free_aligned_buffer_page_end(orig_rgb24); free_aligned_buffer_page_end(dest_i420); } -#endif TEST_F(LibYUVConvertTest, TestJ420ToI420) { const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 672658c9c..d37001f1b 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -29,11 +29,7 @@ #include "libyuv/row.h" /* For ScaleSumSamples_Neon */ #endif -#if defined(LIBYUV_BIT_EXACT) #define EXPECTED_UNATTENUATE_DIFF 0 -#else -#define EXPECTED_UNATTENUATE_DIFF 2 -#endif namespace libyuv { @@ -284,28 +280,28 @@ TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Any) { int max_diff = TestUnattenuateI(benchmark_width_ + 1, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); + ASSERT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Unaligned) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 1); - ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); + ASSERT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Invert) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, -1, 0); - ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); + ASSERT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, ARGBUnattenuate_Opt) { int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_, +1, 0); - ASSERT_LE(max_diff, EXPECTED_UNATTENUATE_DIFF); + ASSERT_EQ(max_diff, 0); } TEST_F(LibYUVPlanarTest, TestARGBComputeCumulativeSum) {