ARGBToAR30 SSSE3: use pmulhuw to replicate fields

AR30 conversion is optimized with 3 techniques:
1. pmulhuw is used to replicate 8 bits to 10 bits (see the scalar sketch below).
2. Two channels are processed at a time: R and B, and A and G.
3. pshufb is used to shift and mask the 2 channels of R and B.
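
A minimal scalar sketch of technique 1 (ours, not libyuv code): with the 8 bit
value v in the upper byte of a 16 bit lane, the high half of an unsigned
multiply by 1028 (= 1024 + 4) equals (v << 2) | (v >> 6), the usual 8-to-10
bit replication.

  #include <assert.h>
  #include <stdint.h>

  int main(void) {
    for (int v = 0; v < 256; ++v) {
      uint16_t lane = (uint16_t)(v << 8);  // 8 bit value in the upper byte
      uint16_t mul = (uint16_t)(((uint32_t)lane * 1028) >> 16);  // pmulhuw model
      assert(mul == ((v << 2) | (v >> 6)));  // replicated 10 bit value
    }
    return 0;
  }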

Bug: libyuv:751
Test: ARGBToAR30_Opt
Change-Id: I4e62d6caa4df7d0ae80395fa911d3c922b6b897b
Reviewed-on: https://chromium-review.googlesource.com/822520
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Frank Barchard committed 2017-12-12 11:05:46 -08:00 (via Commit Bot)
parent d94a4867bf
commit c367751430
6 changed files with 75 additions and 84 deletions


@@ -268,7 +268,7 @@ extern "C" {
 // TODO(fbarchard): Port to Visual C
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
-#define HAS_ARGBTOAR30ROW_SSE2
+#define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
@@ -1796,7 +1796,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
-void ARGBToAR30Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToAR30Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRGB565DitherRow_C(const uint8* src_argb,
                              uint8* dst_rgb,
@@ -2424,7 +2424,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
 void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
                                 uint8* dst_rgb,
                                 int width);
-void ARGBToAR30Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
+void ARGBToAR30Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
 void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
                                     uint8* dst_rgb,


@@ -478,11 +478,11 @@ static int H010ToAR30Matrix(const uint16* src_y,
     }
   }
 #endif
-#if defined(HAS_ARGBTOAR30ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_SSE2;
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
-      ARGBToAR30Row = ARGBToAR30Row_SSE2;
+      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
     }
   }
 #endif


@@ -1333,11 +1333,11 @@ int ARGBToAR30(const uint8* src_argb,
     height = 1;
     src_stride_argb = dst_stride_ar30 = 0;
   }
-#if defined(HAS_ARGBTOAR30ROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBToAR30Row = ARGBToAR30Row_Any_SSE2;
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
     if (IS_ALIGNED(width, 4)) {
-      ARGBToAR30Row = ARGBToAR30Row_SSE2;
+      ARGBToAR30Row = ARGBToAR30Row_SSSE3;
     }
   }
 #endif


@@ -396,8 +396,8 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
 ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
 #endif
-#if defined(HAS_ARGBTOAR30ROW_SSE2)
-ANY11(ARGBToAR30Row_Any_SSE2, ARGBToAR30Row_SSE2, 0, 4, 4, 3)
+#if defined(HAS_ARGBTOAR30ROW_SSSE3)
+ANY11(ARGBToAR30Row_Any_SSSE3, ARGBToAR30Row_SSSE3, 0, 4, 4, 3)
 #endif
 #if defined(HAS_ARGBTOAR30ROW_AVX2)
 ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)


@@ -699,12 +699,16 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
   );
 }
 #endif  // HAS_RGB24TOARGBROW_SSSE3
 /*
+ARGBToAR30Row:
 Red Blue
-With the 8 bit value in the upper bits, vpmulhuw by (1024+4) will produce a 10
-bit value in the low 10 bits of each 16 bit value. This is whats wanted for the
-blue channel. The red needs to be shifted 4 left, so multiply by (1024+4)*16 for
-red.
+With the 8 bit value in the upper bits of a short, vpmulhuw by (1024+4) will
+produce a 10 bit value in the low 10 bits of each 16 bit value. This is what's
+wanted for the blue channel. The red needs to be shifted 4 left, so multiply by
+(1024+4)*16 for red.
 Alpha Green
 Alpha and Green are already in the high bits so vpand can zero out the other
@@ -717,61 +721,6 @@ and then a shift of 4 is a multiply of 16, so (4*16) = 64. Then shift the
 result left 10 to position the A and G channels.
 */
-void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) {
-  asm volatile(
-      "pcmpeqb    %%xmm4,%%xmm4          \n"  // 0x000000ff mask
-      "psrld      $0x18,%%xmm4           \n"
-      "pcmpeqb    %%xmm5,%%xmm5          \n"  // 0xc0000000 mask
-      "pslld      $30,%%xmm5             \n"
-      LABELALIGN
-      "1:                                \n"
-      "movdqu     (%0),%%xmm0            \n"
-      // alpha
-      "movdqa     %%xmm0,%%xmm3          \n"
-      "pand       %%xmm5,%%xmm3          \n"
-      // red
-      "movdqa     %%xmm0,%%xmm1          \n"
-      "psrld      $0x10,%%xmm1           \n"
-      "pand       %%xmm4,%%xmm1          \n"
-      "movdqa     %%xmm1,%%xmm2          \n"
-      "psrld      $0x6,%%xmm2            \n"
-      "pslld      $22,%%xmm1             \n"
-      "pslld      $20,%%xmm2             \n"
-      "por        %%xmm1,%%xmm3          \n"
-      "por        %%xmm2,%%xmm3          \n"
-      // green
-      "movdqa     %%xmm0,%%xmm1          \n"
-      "psrld      $0x08,%%xmm1           \n"
-      "pand       %%xmm4,%%xmm1          \n"
-      "movdqa     %%xmm1,%%xmm2          \n"
-      "psrld      $0x6,%%xmm2            \n"
-      "pslld      $12,%%xmm1             \n"
-      "pslld      $10,%%xmm2             \n"
-      "por        %%xmm1,%%xmm3          \n"
-      "por        %%xmm2,%%xmm3          \n"
-      // blue
-      "pand       %%xmm4,%%xmm0          \n"
-      "movdqa     %%xmm0,%%xmm1          \n"
-      "psrld      $0x6,%%xmm1            \n"
-      "pslld      $2,%%xmm0              \n"
-      "por        %%xmm0,%%xmm3          \n"
-      "por        %%xmm1,%%xmm3          \n"
-      "movdqu     %%xmm3,(%1)            \n"
-      "add        $0x10,%0               \n"
-      "add        $0x10,%1               \n"
-      "sub        $0x4,%2                \n"
-      "jg         1b                     \n"
-      : "+r"(src),   // %0
-        "+r"(dst),   // %1
-        "+r"(width)  // %2
-      ::"memory",
-        "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
-}
-#ifdef HAS_ARGBTOAR30ROW_AVX2
 // Shuffle table for converting RAW to RGB24. Last 8.
 static const uvec8 kShuffleRB30 = {128u, 0u, 128u, 2u, 128u, 4u, 128u, 6u,
                                    128u, 8u, 128u, 10u, 128u, 12u, 128u, 14u};
@@ -780,6 +729,47 @@ static const uint32 kMaskRB10 = 0x3ff003ff;
 static const uint32 kMaskAG10 = 0xc000ff00;
 static const uint32 kMulAG10 = 64 * 65536 + 1028;
+void ARGBToAR30Row_SSSE3(const uint8* src, uint8* dst, int width) {
+  asm volatile(
+      "movdqa     %3,%%xmm2              \n"  // shuffler for RB
+      "movd       %4,%%xmm3              \n"  // multiplier for RB
+      "movd       %5,%%xmm4              \n"  // mask for R10 B10
+      "movd       %6,%%xmm5              \n"  // mask for AG
+      "movd       %7,%%xmm6              \n"  // multiplier for AG
+      "pshufd     $0x0,%%xmm3,%%xmm3     \n"
+      "pshufd     $0x0,%%xmm4,%%xmm4     \n"
+      "pshufd     $0x0,%%xmm5,%%xmm5     \n"
+      "pshufd     $0x0,%%xmm6,%%xmm6     \n"
+      "sub        %0,%1                  \n"
+      "1:                                \n"
+      "movdqu     (%0),%%xmm0            \n"  // fetch 4 ARGB pixels
+      "movdqa     %%xmm0,%%xmm1          \n"
+      "pshufb     %%xmm2,%%xmm1          \n"  // R0B0
+      "pand       %%xmm5,%%xmm0          \n"  // A0G0
+      "pmulhuw    %%xmm3,%%xmm1          \n"  // X2 R16 X4 B10
+      "pmulhuw    %%xmm6,%%xmm0          \n"  // X10 A2 X10 G10
+      "pand       %%xmm4,%%xmm1          \n"  // X2 R10 X10 B10
+      "pslld      $10,%%xmm0             \n"  // A2 x10 G10 x10
+      "por        %%xmm1,%%xmm0          \n"  // A2 R10 G10 B10
+      "movdqu     %%xmm0,(%1,%0)         \n"  // store 4 AR30 pixels
+      "add        $0x10,%0               \n"
+      "sub        $0x4,%2                \n"
+      "jg         1b                     \n"
+      : "+r"(src),          // %0
+        "+r"(dst),          // %1
+        "+r"(width)         // %2
+      : "m"(kShuffleRB30),  // %3
+        "m"(kMulRB10),      // %4
+        "m"(kMaskRB10),     // %5
+        "m"(kMaskAG10),     // %6
+        "m"(kMulAG10)       // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+}
+#ifdef HAS_ARGBTOAR30ROW_AVX2
 void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
   asm volatile(
       "vbroadcastf128 %3,%%ymm2          \n"  // shuffler for RB
@@ -804,15 +794,16 @@ void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
       "jg         1b                     \n"
       "vzeroupper                        \n"
       : "+r"(src),          // %0
         "+r"(dst),          // %1
         "+r"(width)         // %2
       : "m"(kShuffleRB30),  // %3
         "m"(kMulRB10),      // %4
         "m"(kMaskRB10),     // %5
         "m"(kMaskAG10),     // %6
         "m"(kMulAG10)       // %7
-      : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
+        "xmm6");
 }
 #endif
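
For reference, a scalar model of the per-pixel math the SSSE3 code above
implements. This is our illustrative sketch, not part of the commit; it
assumes kMulRB10 = 1028 * 16 * 65536 + 1028, the blue and red multipliers
named in the comment block. AR30 packs B in bits 0..9, G in bits 10..19,
R in bits 20..29 and the top 2 alpha bits in bits 30..31.

  #include <stdint.h>
  #include <stdio.h>

  // Scalar equivalent of one SSSE3 lane group: replicate each 8 bit channel
  // to 10 bits via the high-half multiply, mask, and OR into AR30 order.
  static uint32_t ARGBPixelToAR30(uint8_t b, uint8_t g, uint8_t r, uint8_t a) {
    uint32_t b10 = ((uint32_t)(b << 8) * 1028) >> 16;         // B10, lane bits 0..9
    uint32_t g10 = ((uint32_t)(g << 8) * 1028) >> 16;         // G10, shifted below
    uint32_t r14 = ((uint32_t)(r << 8) * (1028 * 16)) >> 16;  // R10 << 4 in its lane
    uint32_t r10 = r14 & 0x3ff0;  // kMaskRB10 keeps lane bits 4..13
    uint32_t a2 = a >> 6;         // kMaskAG10 keeps the top 2 alpha bits
    // r10 << 16 places the upper lane; g10/a2 shifts model the pslld $10.
    return b10 | (g10 << 10) | (r10 << 16) | (a2 << 30);  // A2 R10 G10 B10
  }

  int main(void) {
    // Opaque white must set every AR30 bit.
    printf("%08x\n", (unsigned)ARGBPixelToAR30(255, 255, 255, 255));  // ffffffff
    return 0;
  }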


@@ -1947,12 +1947,12 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
   ARGBToAR30Row_C(src, dst_c, kPixels);
   int has_avx2 = TestCpuFlag(kCpuHasAVX2);
-  int has_sse2 = TestCpuFlag(kCpuHasSSE2);
+  int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
   for (int i = 0; i < benchmark_iterations_; ++i) {
     if (has_avx2) {
       ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
-    } else if (has_sse2) {
-      ARGBToAR30Row_SSE2(src, dst_opt, kPixels);
+    } else if (has_ssse3) {
+      ARGBToAR30Row_SSSE3(src, dst_opt, kPixels);
     } else {
       ARGBToAR30Row_C(src, dst_opt, kPixels);
     }