diff --git a/README.chromium b/README.chromium
index 3d1777bed..674775cdc 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1861
+Version: 1862
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index 64d0b59f7..d047b703e 100644
--- a/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -42,6 +42,8 @@ extern "C" {
 // The following are available for GCC 32 or 64 bit:
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSEWX8_SSSE3
+#define HAS_TRANSPOSE4X4_32_SSE2
+#define HAS_TRANSPOSE4X4_32_AVX2
 #endif
 
 // The following are available for 64 bit GCC:
@@ -56,6 +58,11 @@ extern "C" {
 #define HAS_TRANSPOSEUVWX8_NEON
 #endif
 
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(LIBYUV_NEON) || defined(__aarch64__))
+#define HAS_TRANSPOSE4X4_32_NEON
+#endif
+
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_TRANSPOSEWX16_MSA
 #define HAS_TRANSPOSEUVWX16_MSA
@@ -240,19 +247,24 @@ void Transpose4x4_32_NEON(const uint8_t* src,
                           int dst_stride,
                           int width);
 
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
 void Transpose4x4_32_C(const uint8_t* src,
                        int src_stride,
                        uint8_t* dst,
                        int dst_stride,
                        int width);
 
-// Transpose 32 bit values (ARGB)
-void Transpose8x8_32_NEON(const uint8_t* src,
-                          int src_stride,
-                          uint8_t* dst,
-                          int dst_stride,
-                          int width);
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
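The new declarations follow the same contract as the existing Transpose4x4_32_NEON and Transpose4x4_32_C kernels: the source is walked downward four rows of 32-bit (ARGB) values at a time, and each 4x4 block is written out transposed while the destination pointer moves right, until `width` source rows have been consumed. A minimal scalar sketch of that contract, as the unit test below exercises it (illustration only; `Transpose4x4_32_Ref` is an invented name, not libyuv's C implementation):

    #include <stdint.h>
    #include <string.h>

    // Hypothetical reference: transpose a (width x 4) strip of 32-bit values
    // into a (4 x width) strip, one 4x4 block per step, using the same
    // (src, src_stride, dst, dst_stride, width) signature as the declarations.
    static void Transpose4x4_32_Ref(const uint8_t* src, int src_stride,
                                    uint8_t* dst, int dst_stride, int width) {
      for (int x = 0; x < width; x += 4) {   // next 4 source rows / dest columns
        for (int i = 0; i < 4; ++i) {        // row within the 4x4 block
          for (int j = 0; j < 4; ++j) {      // column within the 4x4 block
            uint32_t pixel;
            memcpy(&pixel, src + (x + i) * src_stride + j * 4, sizeof(pixel));
            memcpy(dst + j * dst_stride + (x + i) * 4, &pixel, sizeof(pixel));
          }
        }
      }
    }

With the 4x4 case from the new test below (both strides 16 bytes), this reduces to dst[i][j] = src[j][i], which is exactly what Transpose4x4_Test checks.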
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 42f816626..cc1e66e71 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1861
+#define LIBYUV_VERSION 1862
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/rotate_gcc.cc b/source/rotate_gcc.cc
index 1a3f8cbbd..fd5eee05f 100644
--- a/source/rotate_gcc.cc
+++ b/source/rotate_gcc.cc
@@ -365,6 +365,136 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
        "xmm7", "xmm8", "xmm9");
 }
 #endif  // defined(HAS_TRANSPOSEUVWX8_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_SSE2)
+// 4 values, little endian view
+// a b c d
+// e f g h
+// i j k l
+// m n o p
+
+// transpose 2x2
+// a e b f   from row 0, 1
+// i m j n   from row 2, 3
+// c g d h   from row 0, 1
+// k o l p   from row 2, 3
+
+// transpose 4x4
+// a e i m   from row 0, 1
+// b f j n   from row 0, 1
+// c g k o   from row 2, 3
+// d h l p   from row 2, 3
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_SSE2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 4x4. Read a column, write a row.
+      "1:          \n"
+      "movdqu      (%0),%%xmm0                  \n"  // a b c d
+      "movdqu      (%0,%3),%%xmm1               \n"  // e f g h
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+      "movdqu      (%0),%%xmm2                  \n"  // i j k l
+      "movdqu      (%0,%3),%%xmm3               \n"  // m n o p
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "movdqa      %%xmm0,%%xmm4                \n"
+      "movdqa      %%xmm2,%%xmm5                \n"
+      "movdqa      %%xmm0,%%xmm6                \n"
+      "movdqa      %%xmm2,%%xmm7                \n"
+      "punpckldq   %%xmm1,%%xmm4                \n"  // a e b f from row 0, 1
+      "punpckldq   %%xmm3,%%xmm5                \n"  // i m j n from row 2, 3
+      "punpckhdq   %%xmm1,%%xmm6                \n"  // c g d h from row 0, 1
+      "punpckhdq   %%xmm3,%%xmm7                \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "movdqa      %%xmm4,%%xmm0                \n"
+      "movdqa      %%xmm4,%%xmm1                \n"
+      "movdqa      %%xmm6,%%xmm2                \n"
+      "movdqa      %%xmm6,%%xmm3                \n"
+      "punpcklqdq  %%xmm5,%%xmm0                \n"  // a e i m from row 0, 1
+      "punpckhqdq  %%xmm5,%%xmm1                \n"  // b f j n from row 0, 1
+      "punpcklqdq  %%xmm7,%%xmm2                \n"  // c g k o from row 2, 3
+      "punpckhqdq  %%xmm7,%%xmm3                \n"  // d h l p from row 2, 3
+
+      "movdqu      %%xmm0,(%1)                  \n"
+      "lea         16(%1,%4),%1                 \n"  // dst += stride + 16
+      "movdqu      %%xmm1,-16(%1)               \n"
+      "movdqu      %%xmm2,-16(%1,%4)            \n"
+      "movdqu      %%xmm3,-16(%1,%4,2)          \n"
+      "sub         %4,%1                        \n"
+      "sub         $0x4,%2                      \n"
+      "jg          1b                           \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_SSE2)
+
+#if defined(HAS_TRANSPOSE4X4_32_AVX2)
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_AVX2(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  asm volatile(
+      // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
+      "1:          \n"
+      "vmovdqu     (%0),%%xmm0                  \n"  // a b c d
+      "vmovdqu     (%0,%3),%%xmm1               \n"  // e f g h
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+      "vmovdqu     (%0),%%xmm2                  \n"  // i j k l
+      "vmovdqu     (%0,%3),%%xmm3               \n"  // m n o p
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+
+      "vinserti128 $1,(%0),%%ymm0,%%ymm0        \n"  // a b c d
+      "vinserti128 $1,(%0,%3),%%ymm1,%%ymm1     \n"  // e f g h
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+      "vinserti128 $1,(%0),%%ymm2,%%ymm2        \n"  // i j k l
+      "vinserti128 $1,(%0,%3),%%ymm3,%%ymm3     \n"  // m n o p
+      "lea         (%0,%3,2),%0                 \n"  // src += stride * 2
+
+      // Transpose 2x2
+      "vpunpckldq  %%ymm1,%%ymm0,%%ymm4         \n"  // a e b f from row 0, 1
+      "vpunpckldq  %%ymm3,%%ymm2,%%ymm5         \n"  // i m j n from row 2, 3
+      "vpunpckhdq  %%ymm1,%%ymm0,%%ymm6         \n"  // c g d h from row 0, 1
+      "vpunpckhdq  %%ymm3,%%ymm2,%%ymm7         \n"  // k o l p from row 2, 3
+
+      // Transpose 4x4
+      "vpunpcklqdq %%ymm5,%%ymm4,%%ymm0         \n"  // a e i m from row 0, 1
+      "vpunpckhqdq %%ymm5,%%ymm4,%%ymm1         \n"  // b f j n from row 0, 1
+      "vpunpcklqdq %%ymm7,%%ymm6,%%ymm2         \n"  // c g k o from row 2, 3
+      "vpunpckhqdq %%ymm7,%%ymm6,%%ymm3         \n"  // d h l p from row 2, 3
+
+      "vmovdqu     %%ymm0,(%1)                  \n"
+      "lea         32(%1,%4),%1                 \n"  // dst += stride + 32
+      "vmovdqu     %%ymm1,-32(%1)               \n"
+      "vmovdqu     %%ymm2,-32(%1,%4)            \n"
+      "vmovdqu     %%ymm3,-32(%1,%4,2)          \n"
+      "sub         %4,%1                        \n"
+      "sub         $0x8,%2                      \n"
+      "jg          1b                           \n"
+      "vzeroupper  \n"
+      : "+r"(src),                     // %0
+        "+r"(dst),                     // %1
+        "+rm"(width)                   // %2
+      : "r"((ptrdiff_t)(src_stride)),  // %3
+        "r"((ptrdiff_t)(dst_stride))   // %4
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
+        "xmm7");
+}
+#endif  // defined(HAS_TRANSPOSE4X4_32_AVX2)
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
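The SSE2 path above is the classic two-stage 32-bit transpose: punpckldq/punpckhdq interleave pairs of rows (the 2x2 step), then punpcklqdq/punpckhqdq combine the halves (the 4x4 step). For readers who prefer intrinsics to inline asm, a rough equivalent of one 4x4 block looks like this (sketch only; the helper name is invented and this is not how the patch implements it):

    #include <emmintrin.h>  // SSE2
    #include <stdint.h>

    static void Transpose4x4_32_Sketch(const uint8_t* src, int src_stride,
                                       uint8_t* dst, int dst_stride) {
      __m128i r0 = _mm_loadu_si128((const __m128i*)(src + 0 * src_stride));  // a b c d
      __m128i r1 = _mm_loadu_si128((const __m128i*)(src + 1 * src_stride));  // e f g h
      __m128i r2 = _mm_loadu_si128((const __m128i*)(src + 2 * src_stride));  // i j k l
      __m128i r3 = _mm_loadu_si128((const __m128i*)(src + 3 * src_stride));  // m n o p
      __m128i t0 = _mm_unpacklo_epi32(r0, r1);  // a e b f
      __m128i t1 = _mm_unpackhi_epi32(r0, r1);  // c g d h
      __m128i t2 = _mm_unpacklo_epi32(r2, r3);  // i m j n
      __m128i t3 = _mm_unpackhi_epi32(r2, r3);  // k o l p
      _mm_storeu_si128((__m128i*)(dst + 0 * dst_stride), _mm_unpacklo_epi64(t0, t2));  // a e i m
      _mm_storeu_si128((__m128i*)(dst + 1 * dst_stride), _mm_unpackhi_epi64(t0, t2));  // b f j n
      _mm_storeu_si128((__m128i*)(dst + 2 * dst_stride), _mm_unpacklo_epi64(t1, t3));  // c g k o
      _mm_storeu_si128((__m128i*)(dst + 3 * dst_stride), _mm_unpackhi_epi64(t1, t3));  // d h l p
    }

The AVX2 version performs the same shuffles on ymm registers, packing two 4x4 blocks side by side via vinserti128, which is why its loop steps width by 8 instead of 4.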
diff --git a/unit_test/rotate_test.cc b/unit_test/rotate_test.cc
index e8d2ca164..abc08efa8 100644
--- a/unit_test/rotate_test.cc
+++ b/unit_test/rotate_test.cc
@@ -864,7 +864,55 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
 
 #if defined(ENABLE_ROW_TESTS)
 
-TEST_F(LibYUVRotateTest, Transpose4x4) {
+TEST_F(LibYUVRotateTest, Transpose4x4_Test) {
+  // dst width and height
+  const int width = 4;
+  const int height = 4;
+  int src_pixels[4][4];
+  int dst_pixels_c[4][4];
+  int dst_pixels_opt[4][4];
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      src_pixels[i][j] = i * 10 + j;
+    }
+  }
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  const int benchmark_iterations =
+      (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
+      (4 * 4);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_SSE2)
+    if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+  }
+
+  for (int i = 0; i < 4; ++i) {
+    for (int j = 0; j < 4; ++j) {
+      EXPECT_EQ(dst_pixels_c[i][j], src_pixels[j][i]);
+      EXPECT_EQ(dst_pixels_c[i][j], dst_pixels_opt[i][j]);
+    }
+  }
+}
+
+TEST_F(LibYUVRotateTest, Transpose4x4_Opt) {
   // dst width and height
   const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
   const int height = 4;
@@ -874,29 +922,35 @@ TEST_F(LibYUVRotateTest, Transpose4x4) {
 
   MemRandomize(src_pixels, height * width * 4);
   memset(dst_pixels_c, 1, width * height * 4);
-  memset(dst_pixels_opt, 1, width * height * 4);
+  memset(dst_pixels_opt, 2, width * height * 4);
 
   Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                     (uint8_t*)dst_pixels_c, width * 4, width);
 
   for (int i = 0; i < benchmark_iterations_; ++i) {
-#if defined(__aarch64__)
+#if defined(HAS_TRANSPOSE4X4_32_NEON)
     if (TestCpuFlag(kCpuHasNEON)) {
      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
                            (uint8_t*)dst_pixels_opt, width * 4, width);
-    } else {
+    } else
+#elif defined(HAS_TRANSPOSE4X4_32_AVX2)
+    if (TestCpuFlag(kCpuHasAVX2)) {
+      Transpose4x4_32_AVX2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else if (TestCpuFlag(kCpuHasSSE2)) {
+      Transpose4x4_32_SSE2((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else
+#endif
+    {
       Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
                         (uint8_t*)dst_pixels_opt, width * 4, width);
     }
-#else
-    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
-                      (uint8_t*)dst_pixels_opt, width * 4, width);
-#endif
   }
 
-  // for (int i = 0; i < width * height; ++i) {
-  //   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
-  // }
+  for (int i = 0; i < width * height; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
 
   free_aligned_buffer_page_end(src_pixels);
   free_aligned_buffer_page_end(dst_pixels_c);
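One note on the new Transpose4x4_Test: its loop count is scaled so that the single 4x4 block is transposed roughly as many times as there are pixels in one benchmark frame, divided by 16. A rough sketch of that arithmetic, assuming a hypothetical 1280x720 benchmark size and one iteration (the real values come from the test harness flags):

    // Illustration only; 1280x720 and 1 iteration are assumed values.
    int benchmark_iterations_ = 1;
    int benchmark_width_ = 1280;
    int benchmark_height_ = 720;
    int iterations =
        (benchmark_iterations_ * benchmark_width_ * benchmark_height_ + 15) /
        (4 * 4);
    // iterations == 57600; 57600 blocks * 16 pixels == 921600 pixels,
    // i.e. one full 1280x720 frame's worth of 4x4 transposes.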