From ffec313dbe58c6b97d4943387bda618dccbe4591 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 29 Jan 2018 14:27:07 -0800 Subject: [PATCH] ABGRToAR30 used AVX2 with reversed shuffler vpshufb is used to reverse R and B channels; Code is otherwise the same as ARGBToAR30. Bug: libyuv:751 Test: ABGRToAR30 unittest Change-Id: I30e02925f5c729e4496c5963ba4ba4af16633b3b Reviewed-on: https://chromium-review.googlesource.com/891807 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- include/libyuv/convert_from_argb.h | 9 ++++ include/libyuv/row.h | 21 ++++---- source/convert_from_argb.cc | 49 ++++++++++++++++++ source/row_any.cc | 6 +++ source/row_common.cc | 19 +++++-- source/row_gcc.cc | 81 +++++++++++++++++++++++++++++- unit_test/convert_test.cc | 40 ++++++++++++++- 7 files changed, 210 insertions(+), 15 deletions(-) diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index c9a7e59de..a04826574 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -55,6 +55,15 @@ int ARGBToRGBA(const uint8_t* src_argb, int width, int height); +// Convert ABGR To AR30. +LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height); + // Convert ARGB To AR30. 
LIBYUV_API int ARGBToAR30(const uint8_t* src_argb, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 9cf711e27..caa45fb31 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -252,6 +252,7 @@ extern "C" { // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) +#define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 @@ -268,6 +269,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) +#define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 @@ -1688,7 +1690,8 @@ void ARGBToARGB1555Row_SSE2(const uint8_t* src_argb, void ARGBToARGB4444Row_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void ARGBToRGB565DitherRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, @@ -1710,7 +1713,8 @@ void ARGBToARGB1555Row_AVX2(const uint8_t* src_argb, void ARGBToARGB4444Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void ARGBToRGB24Row_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, int width); @@ -1745,7 +1749,8 @@ void ARGBToRAWRow_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToRGB565Row_C(const uint8_t* 
src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB1555Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); @@ -2407,9 +2412,8 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8_t* src_argb, void ARGBToARGB4444Row_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); +void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_argb, uint8_t* dst_rgb, @@ -2429,9 +2433,8 @@ void ARGBToARGB1555Row_Any_AVX2(const uint8_t* src_argb, void ARGBToARGB4444Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_rgb, int width); -void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, - uint8_t* dst_rgb, - int width); +void ABGRToAR30Row_Any_AVX2(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); +void ARGBToAR30Row_Any_AVX2(const uint8_t* src_argb, uint8_t* dst_ar30, int width); void ARGBToRGB24Row_Any_NEON(const uint8_t* src_argb, uint8_t* dst_rgb, diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 1fc4289a8..839bc333e 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1217,6 +1217,55 @@ int ARGBToARGB4444(const uint8_t* src_argb, return 0; } +// Convert ABGR To AR30. 
+LIBYUV_API +int ABGRToAR30(const uint8_t* src_abgr, + int src_stride_abgr, + uint8_t* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + int y; + void (*ABGRToAR30Row)(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) = + ABGRToAR30Row_C; + if (!src_abgr || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_abgr = src_abgr + (height - 1) * src_stride_abgr; + src_stride_abgr = -src_stride_abgr; + } + // Coalesce contiguous rows into one long row. + if (src_stride_abgr == width * 4 && dst_stride_ar30 == width * 4) { + width *= height; + height = 1; + src_stride_abgr = dst_stride_ar30 = 0; + } +#if defined(HAS_ABGRTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ABGRToAR30Row = ABGRToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ABGRToAR30Row = ABGRToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToAR30Row = ABGRToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ABGRToAR30Row = ABGRToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + ABGRToAR30Row(src_abgr, dst_ar30, width); + src_abgr += src_stride_abgr; + dst_ar30 += dst_stride_ar30; + } + return 0; +} + // Convert ARGB To AR30. 
LIBYUV_API int ARGBToAR30(const uint8_t* src_argb, diff --git a/source/row_any.cc b/source/row_any.cc index 3f7c4a7b7..31e5ea37d 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -428,9 +428,15 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif +#if defined(HAS_ABGRTOAR30ROW_SSSE3) +ANY11(ABGRToAR30Row_Any_SSSE3, ABGRToAR30Row_SSSE3, 0, 4, 4, 3) +#endif #if defined(HAS_ARGBTOAR30ROW_SSSE3) ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) #endif +#if defined(HAS_ABGRTOAR30ROW_AVX2) +ANY11(ABGRToAR30Row_Any_AVX2, ABGRToAR30Row_AVX2, 0, 4, 4, 7) +#endif #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) #endif diff --git a/source/row_common.cc b/source/row_common.cc index a34dda407..297d87e01 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -348,15 +348,28 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { } } -void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width) { +void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width) { + int x; + for (x = 0; x < width; ++x) { + uint32_t r0 = (src_abgr[0] >> 6) | ((uint32_t)(src_abgr[0]) << 2); + uint32_t g0 = (src_abgr[1] >> 6) | ((uint32_t)(src_abgr[1]) << 2); + uint32_t b0 = (src_abgr[2] >> 6) | ((uint32_t)(src_abgr[2]) << 2); + uint32_t a0 = (src_abgr[3] >> 6); + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; + src_abgr += 4; + } +} + +void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { int x; for (x = 0; x < width; ++x) { uint32_t b0 = (src_argb[0] >> 6) | ((uint32_t)(src_argb[0]) << 2); uint32_t g0 = (src_argb[1] >> 6) | ((uint32_t)(src_argb[1]) << 2); uint32_t r0 = (src_argb[2] >> 6) | ((uint32_t)(src_argb[2]) << 2); uint32_t a0 = (src_argb[3] 
>> 6); - *(uint32_t*)(dst_rgb) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); - dst_rgb += 4; + *(uint32_t*)(dst_ar30) = b0 | (g0 << 10) | (r0 << 20) | (a0 << 30); + dst_ar30 += 4; src_argb += 4; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index e430ec77a..4451fa2ab 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -730,6 +730,10 @@ result left 10 to position the A and G channels. // Shuffle table for converting RAW to RGB24. Last 8. static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; + +static const uvec8 kShuffleBR30 = {128u, 2u, 128u, 0u, 128u, 6u, 128u, 4u, + 128u, 10u, 128u, 8u, 128u, 14u, 128u, 12u}; + static const uint32_t kMulRB10 = 1028 * 16 * 65536 + 1028; static const uint32_t kMaskRB10 = 0x3ff003ff; static const uint32_t kMaskAG10 = 0xc000ff00; @@ -774,8 +778,46 @@ void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -#ifdef HAS_ARGBTOAR30ROW_AVX2 +void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // reversed shuffler for RB + "movd %4,%%xmm3 \n" // multiplier for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multiplier for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ABGR pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // 
%0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB @@ -812,6 +854,43 @@ void ARGBToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +#ifdef HAS_ABGRTOAR30ROW_AVX2 +void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { + asm volatile( + "vbroadcastf128 %3,%%ymm2 \n" // reversed shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multiplier for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multiplier for AG + "sub %0,%1 \n" + + "1: \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ABGR pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels + "add $0x20,%0 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + "vzeroupper \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleBR30), // %3 reversed shuffler + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + #ifdef HAS_ARGBTOYROW_SSSE3 // Convert 16 ARGB pixels (64 bytes) to 16 Y values. 
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index f4382882c..0f7c1dfb1 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -41,6 +41,7 @@ namespace libyuv { // Alias to copy pixels as is #define AR30ToAR30 ARGBCopy +#define ABGRToABGR ARGBCopy #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a)) @@ -1065,6 +1066,7 @@ TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1, 0) +TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) @@ -1945,9 +1947,9 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) // Caveat: Destination needs to be 4 bytes TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4) +TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4) - -// TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ABGR, 4) +TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4) TEST_F(LibYUVConvertTest, RotateWithARGBSource) { // 2x2 frames @@ -2018,6 +2020,40 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { } #endif // HAS_ARGBTOAR30ROW_AVX2 +#ifdef HAS_ABGRTOAR30ROW_AVX2 +TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { + // ABGRToAR30Row_AVX2 expects a multiple of 8 pixels. 
+ const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7; + align_buffer_page_end(src, kPixels * 4); + align_buffer_page_end(dst_opt, kPixels * 4); + align_buffer_page_end(dst_c, kPixels * 4); + MemRandomize(src, kPixels * 4); + memset(dst_opt, 0, kPixels * 4); + memset(dst_c, 1, kPixels * 4); + + ABGRToAR30Row_C(src, dst_c, kPixels); + + int has_avx2 = TestCpuFlag(kCpuHasAVX2); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); + for (int i = 0; i < benchmark_iterations_; ++i) { + if (has_avx2) { + ABGRToAR30Row_AVX2(src, dst_opt, kPixels); + } else if (has_ssse3) { + ABGRToAR30Row_SSSE3(src, dst_opt, kPixels); + } else { + ABGRToAR30Row_C(src, dst_opt, kPixels); + } + } + for (int i = 0; i < kPixels * 4; ++i) { + EXPECT_EQ(dst_opt[i], dst_c[i]); + } + + free_aligned_buffer_page_end(src); + free_aligned_buffer_page_end(dst_opt); + free_aligned_buffer_page_end(dst_c); +} +#endif // HAS_ABGRTOAR30ROW_AVX2 + // TODO(fbarchard): Fix clamping issue affected by U channel. #define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ ALIGN, YALIGN, W1280, DIFF, N, NEG, SOFF, DOFF) \