mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
ARGBToUV for SSE use pshufb/pmaddubsw
Was ARGBToJ420_Opt (377 ms)
Now ARGBToJ420_Opt (340 ms)
Bug: None
Change-Id: Iada2d6e9ecdb141b9e2acbdf343f890e4aaebe34
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6967754
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
d59fe1a2b8
commit
1b1c058787
@ -1733,6 +1733,13 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
|||||||
#endif // HAS_ARGBTOUV444ROW_AVX2
|
#endif // HAS_ARGBTOUV444ROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||||
|
|
||||||
|
// ARGBARGB to AARRGGBB shuffle
|
||||||
|
static const lvec8 kShuffleAARRGGBB = {
|
||||||
|
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
|
||||||
|
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
|
||||||
|
};
|
||||||
|
|
||||||
// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
|
// 8x2 -> 4x1 ARGB pixels converted to 4 U and 4 V
|
||||||
// ARGBToUV does rounding average of 4 ARGB pixels
|
// ARGBToUV does rounding average of 4 ARGB pixels
|
||||||
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||||
@ -1742,69 +1749,52 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
|||||||
int width,
|
int width,
|
||||||
const struct RgbUVConstants* rgbuvconstants) {
|
const struct RgbUVConstants* rgbuvconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"pcmpeqb %%xmm4,%%xmm4 \n" // 0x0101
|
"movdqa %5,%%xmm4 \n" // RGBToU
|
||||||
"pabsb %%xmm4,%%xmm4 \n"
|
"movdqa %6,%%xmm5 \n" // RGBToV
|
||||||
"movdqa %0,%%xmm6 \n" // ARGB to U
|
"pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101
|
||||||
"movdqa %1,%%xmm7 \n" // ARGB to V
|
"pabsb %%xmm6,%%xmm6 \n"
|
||||||
:
|
"movdqa %7,%%xmm7 \n" // kShuffleAARRGGBB
|
||||||
: "m"(rgbuvconstants->kRGBToU), // %0
|
"sub %1,%2 \n"
|
||||||
"m"(rgbuvconstants->kRGBToV) // %1
|
|
||||||
: "memory", "cc");
|
|
||||||
|
|
||||||
asm volatile(
|
|
||||||
|
|
||||||
"sub %1,%2 \n"
|
|
||||||
|
|
||||||
LABELALIGN
|
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqu (%0),%%xmm0 \n" // Read 8 ARGB Pixels
|
"movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels
|
||||||
"movdqu 0x10(%0),%%xmm5 \n"
|
"movdqu 0x10(%0),%%xmm1 \n"
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
|
||||||
"shufps $0x88,%%xmm5,%%xmm0 \n" // Even pixels
|
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
||||||
"shufps $0xdd,%%xmm5,%%xmm1 \n" // Odd pixels
|
"pshufb %%xmm7,%%xmm0 \n" // aarrggbb
|
||||||
"movdqa %%xmm0,%%xmm5 \n"
|
"pshufb %%xmm7,%%xmm1 \n"
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n" // aarrggbb
|
"pshufb %%xmm7,%%xmm2 \n"
|
||||||
"punpckhbw %%xmm5,%%xmm1 \n"
|
"pshufb %%xmm7,%%xmm3 \n"
|
||||||
"pmaddubsw %%xmm4,%%xmm0 \n" // paired add argb
|
"pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2
|
||||||
"pmaddubsw %%xmm4,%%xmm1 \n"
|
"pmaddubsw %%xmm6,%%xmm1 \n"
|
||||||
|
"pmaddubsw %%xmm6,%%xmm2 \n"
|
||||||
|
"pmaddubsw %%xmm6,%%xmm3 \n"
|
||||||
|
"paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1
|
||||||
|
"paddw %%xmm3,%%xmm1 \n"
|
||||||
|
"pxor %%xmm2,%%xmm2 \n" // 0 for pavgw
|
||||||
|
"psrlw $1,%%xmm0 \n"
|
||||||
|
"psrlw $1,%%xmm1 \n"
|
||||||
|
"pavgw %%xmm2,%%xmm0 \n"
|
||||||
|
"pavgw %%xmm2,%%xmm1 \n"
|
||||||
|
"packuswb %%xmm1,%%xmm0 \n" // 4 averaged ARGB pixels
|
||||||
|
|
||||||
"movdqu 0x00(%0,%4,1),%%xmm2 \n" // Read 2nd row
|
"movdqa %%xmm6,%%xmm2 \n"
|
||||||
"movdqu 0x10(%0,%4,1),%%xmm5 \n"
|
"psllw $15,%%xmm2 \n" // 0x8000
|
||||||
"movdqa %%xmm2,%%xmm3 \n"
|
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
"shufps $0x88,%%xmm5,%%xmm2 \n" // Even
|
"pmaddubsw %%xmm5,%%xmm1 \n" // 4 V
|
||||||
"shufps $0xdd,%%xmm5,%%xmm3 \n" // Odd pixels
|
"pmaddubsw %%xmm4,%%xmm0 \n" // 4 U
|
||||||
"movdqa %%xmm2,%%xmm5 \n"
|
"phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv
|
||||||
"punpcklbw %%xmm3,%%xmm2 \n" // aarrggbb
|
"psubw %%xmm0,%%xmm2 \n"
|
||||||
"punpckhbw %%xmm5,%%xmm3 \n"
|
"psrlw $0x8,%%xmm2 \n"
|
||||||
"pmaddubsw %%xmm4,%%xmm2 \n" // argb
|
"packuswb %%xmm2,%%xmm2 \n"
|
||||||
"pmaddubsw %%xmm4,%%xmm3 \n"
|
"movd %%xmm2,(%1) \n" // Write 4 U's
|
||||||
|
"pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes
|
||||||
|
"movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's
|
||||||
|
|
||||||
"pxor %%xmm5,%%xmm5 \n" // constant 0 for pavgw
|
"lea 0x20(%0),%0 \n"
|
||||||
"paddw %%xmm2,%%xmm0 \n"
|
"lea 0x4(%1),%1 \n"
|
||||||
"paddw %%xmm3,%%xmm1 \n"
|
"subl $0x8,%3 \n"
|
||||||
"psrlw $1,%%xmm0 \n" // round
|
"jg 1b \n"
|
||||||
"psrlw $1,%%xmm1 \n"
|
|
||||||
"pavgw %%xmm5,%%xmm0 \n"
|
|
||||||
"pavgw %%xmm5,%%xmm1 \n"
|
|
||||||
"packuswb %%xmm1,%%xmm0 \n" // 4 ARGB pixels
|
|
||||||
|
|
||||||
"movdqa %%xmm0,%%xmm1 \n"
|
|
||||||
"pmaddubsw %%xmm6,%%xmm0 \n" // u
|
|
||||||
"pmaddubsw %%xmm7,%%xmm1 \n" // v
|
|
||||||
"phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv
|
|
||||||
|
|
||||||
"movdqa %5,%%xmm2 \n" // 0x8000
|
|
||||||
"psubw %%xmm0,%%xmm2 \n" // unsigned 0 to 0xffff
|
|
||||||
"psrlw $0x8,%%xmm2 \n"
|
|
||||||
"packuswb %%xmm2,%%xmm2 \n"
|
|
||||||
"movd %%xmm2,(%1) \n" // Write 4 U's
|
|
||||||
"shufps $0xdd,%%xmm2,%%xmm2 \n"
|
|
||||||
"movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's
|
|
||||||
|
|
||||||
"lea 0x20(%0),%0 \n"
|
|
||||||
"lea 0x4(%1),%1 \n"
|
|
||||||
"subl $0x8,%3 \n"
|
|
||||||
"jg 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
@ -1814,21 +1804,17 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
|||||||
"+rm"(width) // %3
|
"+rm"(width) // %3
|
||||||
#endif
|
#endif
|
||||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||||
"m"(kAddUV128) // %5
|
"m"(rgbuvconstants->kRGBToU), // %5
|
||||||
|
"m"(rgbuvconstants->kRGBToV), // %6
|
||||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
"m"(kShuffleAARRGGBB) // %7
|
||||||
"xmm7");
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||||
|
"xmm6", "xmm7");
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUVROW_AVX2
|
#ifdef HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
// ARGBARGB to AARRGGBB shuffle
|
|
||||||
static const lvec8 kShuffleAARRGGBB = {
|
|
||||||
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
|
|
||||||
0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
|
|
||||||
};
|
|
||||||
|
|
||||||
// 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
|
// 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
|
||||||
// ARGBToUV does rounding average of 4 ARGB pixels
|
// ARGBToUV does rounding average of 4 ARGB pixels
|
||||||
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||||
@ -1888,7 +1874,11 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
|||||||
: "+r"(src_argb), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_u), // %1
|
"+r"(dst_u), // %1
|
||||||
"+r"(dst_v), // %2
|
"+r"(dst_v), // %2
|
||||||
"+r"(width) // %3
|
#if defined(__i386__)
|
||||||
|
"+m"(width) // %3
|
||||||
|
#else
|
||||||
|
"+rm"(width) // %3
|
||||||
|
#endif
|
||||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||||
"m"(rgbuvconstants->kRGBToU), // %5
|
"m"(rgbuvconstants->kRGBToU), // %5
|
||||||
"m"(rgbuvconstants->kRGBToV), // %6
|
"m"(rgbuvconstants->kRGBToV), // %6
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user