From c3677514309ca27d3f8ffb22272a9b699e544131 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 12 Dec 2017 11:05:46 -0800 Subject: [PATCH] ARGBToAR30 SSSE3 use pmulhuw to replicate fields AR30 is optimized with 3 techniques 1. pmulhuw is used to replicate 8 bits to 10 bits. 2. Two channels are processed at a time. R and B, and A and G. 3. pshufb is used to shift and mask 2 channels of R and B Bug: libyuv:751 Test: ARGBToAR30_Opt Change-Id: I4e62d6caa4df7d0ae80395fa911d3c922b6b897b Reviewed-on: https://chromium-review.googlesource.com/822520 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- include/libyuv/row.h | 6 +- source/convert_argb.cc | 8 +-- source/convert_from_argb.cc | 8 +-- source/row_any.cc | 4 +- source/row_gcc.cc | 127 +++++++++++++++++------------------- unit_test/convert_test.cc | 6 +- 6 files changed, 75 insertions(+), 84 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 6937c7d37..cb7196938 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -268,7 +268,7 @@ extern "C" { // TODO(fbarchard): Port to Visual C #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) -#define HAS_ARGBTOAR30ROW_SSE2 +#define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 @@ -1796,7 +1796,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToAR30Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToAR30Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, @@ -2424,7 +2424,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, void 
ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); -void ARGBToAR30Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width); +void ARGBToAR30Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width); void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index f0e78b7f6..5b6ddadb9 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -478,11 +478,11 @@ static int H010ToAR30Matrix(const uint16* src_y, } } #endif -#if defined(HAS_ARGBTOAR30ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSE2; +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSE2; + ARGBToAR30Row = ARGBToAR30Row_SSSE3; } } #endif diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 02a546858..4dca24058 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1333,11 +1333,11 @@ int ARGBToAR30(const uint8* src_argb, height = 1; src_stride_argb = dst_stride_ar30 = 0; } -#if defined(HAS_ARGBTOAR30ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSE2; +#if defined(HAS_ARGBTOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSE2; + ARGBToAR30Row = ARGBToAR30Row_SSSE3; } } #endif diff --git a/source/row_any.cc b/source/row_any.cc index a369b2490..6d65ca7d3 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -396,8 +396,8 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7) #endif -#if defined(HAS_ARGBTOAR30ROW_SSE2) -ANY11(ARGBToAR30Row_Any_SSE2, ARGBToAR30Row_SSE2, 0, 4, 4, 3) +#if 
defined(HAS_ARGBTOAR30ROW_SSSE3) +ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3) #endif #if defined(HAS_ARGBTOAR30ROW_AVX2) ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 65d34c2ef..4eda05979 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -699,12 +699,16 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { ); } #endif // HAS_RGB24TOARGBROW_SSSE3 + /* + +ARGBToAR30Row: + Red Blue -With the 8 bit value in the upper bits, vpmulhuw by (1024+4) will produce a 10 -bit value in the low 10 bits of each 16 bit value. This is whats wanted for the -blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for -red. +With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will +produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's +wanted for the blue channel. The red needs to be shifted 4 left, so multiply by +(1024+4)*16 for red. Alpha Green Alpha and Green are already in the high bits so vpand can zero out the other @@ -717,61 +721,6 @@ and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the result left 10 to position the A and G channels. 
*/ -void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) { - asm volatile( - "pcmpeqb %%xmm4,%%xmm4 \n" // 0x000000ff mask - "psrld $0x18,%%xmm4 \n" - "pcmpeqb %%xmm5,%%xmm5 \n" // 0xc0000000 mask - "pslld $30,%%xmm5 \n" - - LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - // alpha - "movdqa %%xmm0,%%xmm3 \n" - "pand %%xmm5,%%xmm3 \n" - // red - "movdqa %%xmm0,%%xmm1 \n" - "psrld $0x10,%%xmm1 \n" - "pand %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "psrld $0x6,%%xmm2 \n" - "pslld $22,%%xmm1 \n" - "pslld $20,%%xmm2 \n" - "por %%xmm1,%%xmm3 \n" - "por %%xmm2,%%xmm3 \n" - // green - "movdqa %%xmm0,%%xmm1 \n" - "psrld $0x08,%%xmm1 \n" - "pand %%xmm4,%%xmm1 \n" - "movdqa %%xmm1,%%xmm2 \n" - "psrld $0x6,%%xmm2 \n" - "pslld $12,%%xmm1 \n" - "pslld $10,%%xmm2 \n" - "por %%xmm1,%%xmm3 \n" - "por %%xmm2,%%xmm3 \n" - // blue - "pand %%xmm4,%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "psrld $0x6,%%xmm1 \n" - "pslld $2,%%xmm0 \n" - "por %%xmm0,%%xmm3 \n" - "por %%xmm1,%%xmm3 \n" - - "movdqu %%xmm3,(%1) \n" - "add $0x10,%0 \n" - "add $0x10,%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); -} - -#ifdef HAS_ARGBTOAR30ROW_AVX2 - // Shuffle table for converting RAW to RGB24. Last 8. 
static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; @@ -780,6 +729,47 @@ static const uint32 kMaskRB10 = 0x3ff003ff; static const uint32 kMaskAG10 = 0xc000ff00; static const uint32 kMulAG10 = 64 * 65536 + 1028; +void ARGBToAR30Row_SSSE3(const uint8* src, uint8* dst, int width) { + asm volatile( + "movdqa %3,%%xmm2 \n" // shuffler for RB + "movd %4,%%xmm3 \n" // multiplier for RB + "movd %5,%%xmm4 \n" // mask for R10 B10 + "movd %6,%%xmm5 \n" // mask for AG + "movd %7,%%xmm6 \n" // multiplier for AG + "pshufd $0x0,%%xmm3,%%xmm3 \n" + "pshufd $0x0,%%xmm4,%%xmm4 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm6,%%xmm6 \n" + "sub %0,%1 \n" + + "1: \n" + "movdqu (%0),%%xmm0 \n" // fetch 4 ARGB pixels + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm1 \n" // R0B0 + "pand %%xmm5,%%xmm0 \n" // A0G0 + "pmulhuw %%xmm3,%%xmm1 \n" // X2 R16 X4 B10 + "pmulhuw %%xmm6,%%xmm0 \n" // X10 A2 X10 G10 + "pand %%xmm4,%%xmm1 \n" // X2 R10 X10 B10 + "pslld $10,%%xmm0 \n" // A2 x10 G10 x10 + "por %%xmm1,%%xmm0 \n" // A2 R10 G10 B10 + "movdqu %%xmm0,(%1,%0) \n" // store 4 AR30 pixels + "add $0x10,%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#ifdef HAS_ARGBTOAR30ROW_AVX2 + void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { asm volatile( "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB @@ -804,15 +794,16 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "m"(kShuffleRB30), // %3 - "m"(kMulRB10), // %4 - "m"(kMaskRB10), // %5 - "m"(kMaskAG10), // %6 - "m"(kMulAG10) // %7 - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5", "xmm6"); + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", + "xmm6"); } #endif diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 326446480..394bb7821 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1947,12 +1947,12 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { ARGBToAR30Row_C(src, dst_c, kPixels); int has_avx2 = TestCpuFlag(kCpuHasAVX2); - int has_sse2 = TestCpuFlag(kCpuHasSSE2); + int has_ssse3 = TestCpuFlag(kCpuHasSSSE3); for (int i = 0; i < benchmark_iterations_; ++i) { if (has_avx2) { ARGBToAR30Row_AVX2(src, dst_opt, kPixels); - } else if (has_sse2) { - ARGBToAR30Row_SSE2(src, dst_opt, kPixels); + } else if (has_ssse3) { + ARGBToAR30Row_SSSE3(src, dst_opt, kPixels); } else { ARGBToAR30Row_C(src, dst_opt, kPixels); }