From 11dd1b956faee1bbf88f4966e2dba48805d13ec7 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 11 Dec 2017 18:21:34 -0800 Subject: [PATCH] ARGBToAR30 use vpmulhuw to replicate fields AR30 is optimized with 3 techniques 1. vpmulhuw is used to replicate 8 bits to 10 bits. 2. Two channels are processed at a time. R and B, and A and G. 3. vpshufb is used to shift and mask 2 channels of R and B Red Blue With the 8 bit value in the upper bits, vpmulhuw by (1024+4) will produce a 10 bit value in the low 10 bits of each 16 bit value. This is whats wanted for the blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for red. Alpha Green Alpha and Green are already in the high bits so vpand can zero out the other bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha would be a simple multiplier to shift it into position. It wants a gap of 10 above the green. Green is 10 bits, so there are 6 bits in the low short. 4 more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the result left 10 to position the A and G channels. Bug: libyuv:751 Test: ARGBToAR30_Opt Change-Id: Ie4f20dce18203bae7b75acb1fd5232db8a8a4f11 Reviewed-on: https://chromium-review.googlesource.com/820046 Commit-Queue: Frank Barchard Reviewed-by: Cheng Wang --- source/row_gcc.cc | 89 ++++++++++++++++++++++----------------- unit_test/convert_test.cc | 1 - 2 files changed, 51 insertions(+), 39 deletions(-) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 0b47210cb..65d34c2ef 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -699,6 +699,23 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) { ); } #endif // HAS_RGB24TOARGBROW_SSSE3 +/* +Red Blue +With the 8 bit value in the upper bits, vpmulhuw by (1024+4) will produce a 10 +bit value in the low 10 bits of each 16 bit value. This is whats wanted for the +blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for +red. + +Alpha Green +Alpha and Green are already in the high bits so vpand can zero out the other +bits, keeping just 2 upper bits of alpha and 8 bit green. The same multiplier +could be used for Green - (1024+4) putting the 10 bit green in the lsb. Alpha +would be a simple multiplier to shift it into position. It wants a gap of 10 +above the green. Green is 10 bits, so there are 6 bits in the low short. 4 +more are needed, so a multiplier of 4 gets the 2 bits into the upper 16 bits, +and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the +result left 10 to position the A and G channels. +*/ void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) { asm volatile( @@ -754,52 +771,48 @@ void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) { } #ifdef HAS_ARGBTOAR30ROW_AVX2 + +// Shuffle table for converting RAW to RGB24. Last 8. +static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u, + 128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u}; +static const uint32 kMulRB10 = 1028 * 16 * 65536 + 1028; +static const uint32 kMaskRB10 = 0x3ff003ff; +static const uint32 kMaskAG10 = 0xc000ff00; +static const uint32 kMulAG10 = 64 * 65536 + 1028; + void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x000000ff mask - "vpsrld $0x18,%%ymm4,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0xc0000000 mask - "vpslld $30,%%ymm5,%%ymm5 \n" + "vbroadcastf128 %3,%%ymm2 \n" // shuffler for RB + "vbroadcastss %4,%%ymm3 \n" // multipler for RB + "vbroadcastss %5,%%ymm4 \n" // mask for R10 B10 + "vbroadcastss %6,%%ymm5 \n" // mask for AG + "vbroadcastss %7,%%ymm6 \n" // multipler for AG + "sub %0,%1 \n" - LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm0 \n" - // alpha - "vpand %%ymm5,%%ymm0,%%ymm3 \n" - // red - "vpsrld $0x10,%%ymm0,%%ymm1 \n" - "vpand %%ymm4,%%ymm1,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $22,%%ymm1,%%ymm1 \n" - "vpslld $20,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - // green - "vpsrld $0x08,%%ymm0,%%ymm1 \n" - "vpand %%ymm4,%%ymm1,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $12,%%ymm1,%%ymm1 \n" - "vpslld $10,%%ymm2,%%ymm2 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - // blue - "vpand %%ymm4,%%ymm0,%%ymm1 \n" - "vpsrld $0x6,%%ymm1,%%ymm2 \n" - "vpslld $2,%%ymm1,%%ymm1 \n" - "vpor %%ymm1,%%ymm3,%%ymm3 \n" - "vpor %%ymm2,%%ymm3,%%ymm3 \n" - - "vmovdqu %%ymm3,(%1) \n" + "vmovdqu (%0),%%ymm0 \n" // fetch 8 ARGB pixels + "vpshufb %%ymm2,%%ymm0,%%ymm1 \n" // R0B0 + "vpand %%ymm5,%%ymm0,%%ymm0 \n" // A0G0 + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" // X2 R16 X4 B10 + "vpmulhuw %%ymm6,%%ymm0,%%ymm0 \n" // X10 A2 X10 G10 + "vpand %%ymm4,%%ymm1,%%ymm1 \n" // X2 R10 X10 B10 + "vpslld $10,%%ymm0,%%ymm0 \n" // A2 x10 G10 x10 + "vpor %%ymm1,%%ymm0,%%ymm0 \n" // A2 R10 G10 B10 + "vmovdqu %%ymm0,(%1,%0) \n" // store 8 AR30 pixels "add $0x20,%0 \n" - "add $0x20,%1 \n" "sub $0x8,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - ::"memory", - "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "m"(kShuffleRB30), // %3 + "m"(kMulRB10), // %4 + "m"(kMaskRB10), // %5 + "m"(kMaskAG10), // %6 + "m"(kMulAG10) // %7 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index ba9ad8291..326446480 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1940,7 +1940,6 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) { align_buffer_page_end(src, kPixels * 4); align_buffer_page_end(dst_opt, kPixels * 4); align_buffer_page_end(dst_c, kPixels * 4); - MemRandomize(src, kPixels * 4); memset(dst_opt, 0, kPixels * 4); memset(dst_c, 1, kPixels * 4);