diff --git a/README.chromium b/README.chromium index 41eae983b..d372e45da 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1733 +Version: 1734 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5cbdaadee..bae83154a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -275,6 +275,7 @@ extern "C" { #define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 +#define HAS_SWAPUVROW_SSSE3 #endif // The following are available for AVX2 gcc/clang x86 platforms: @@ -295,6 +296,7 @@ extern "C" { #define HAS_I422TOYUY2ROW_AVX2 #define HAS_MERGEUVROW_16_AVX2 #define HAS_MULTIPLYROW_16_AVX2 +#define HAS_SWAPUVROW_AVX2 // TODO(fbarchard): Fix AVX2 version of YUV24 // #define HAS_NV21TOYUV24ROW_AVX2 #endif @@ -3374,6 +3376,10 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr, void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_Any_SSSE3(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); +void SwapUVRow_Any_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width); void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width); void AYUVToUVRow_C(const uint8_t* src_ayuv, int stride_ayuv, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 0f245bf9a..0f9a4508f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1733 +#define LIBYUV_VERSION 1734 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc 
b/source/planar_functions.cc index d739baa14..5a9d56d88 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -527,6 +527,22 @@ void SwapUVPlane(const uint8_t* src_uv, src_stride_uv = dst_stride_vu = 0; } +#if defined(HAS_SWAPUVROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + SwapUVRow = SwapUVRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + SwapUVRow = SwapUVRow_SSSE3; + } + } +#endif +#if defined(HAS_SWAPUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + SwapUVRow = SwapUVRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + SwapUVRow = SwapUVRow_AVX2; + } + } +#endif #if defined(HAS_SWAPUVROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { SwapUVRow = SwapUVRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index ef89350ec..da91347b5 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -710,6 +710,12 @@ ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15) #ifdef HAS_AYUVTOYROW_NEON ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15) #endif +#ifdef HAS_SWAPUVROW_SSSE3 +ANY11(SwapUVRow_Any_SSSE3, SwapUVRow_SSSE3, 0, 2, 2, 15) +#endif +#ifdef HAS_SWAPUVROW_AVX2 +ANY11(SwapUVRow_Any_AVX2, SwapUVRow_AVX2, 0, 2, 2, 31) +#endif #ifdef HAS_SWAPUVROW_NEON ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 18b6350b8..7c6dee156 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -6790,6 +6790,68 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y, } #endif // HAS_NV21TOYUV24ROW_AVX2 +#ifdef HAS_SWAPUVROW_SSSE3 + +// Shuffle table for swapping the U and V bytes of each UV pair.
+static const uvec8 kShuffleUVToVU = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u, + 9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u}; + +void SwapUVRow_SSSE3(const uint8_t* src_uv, + uint8_t* dst_vu, + int width) { + asm volatile( + + "movdqu %3,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pshufb %%xmm5,%%xmm0 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_SSSE3 + +#ifdef HAS_SWAPUVROW_AVX2 +void SwapUVRow_AVX2(const uint8_t* src_uv, + uint8_t* dst_vu, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm5 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "lea 0x40(%0),%0 \n" + "vpshufb %%ymm5,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x40(%1),%1 \n" + "sub $0x20,%2 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_uv), // %0 + "+r"(dst_vu), // %1 + "+r"(width) // %2 + : "m"(kShuffleUVToVU) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm5"); +} +#endif // HAS_SWAPUVROW_AVX2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index c564ced6a..a71e752cd 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -994,6 +994,7 @@ TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) +// TODO(fbarchard): Investigate high error on Win32. TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 10) TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)