mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
Add ARGBToAR30Row_SSE2 to speed up H010ToAR30
Port ARGBToAR30Row_AVX2 to ARGBToAR30Row_SSE2 using the same instructions, but with xmm registers and half as many pixels per loop.

Bug: libyuv:751
Test: LibYUVConvertTest.ARGBToAR30_Opt
Change-Id: Id644e54639133d1caf28ea3cd11ff6ab6891a673
Reviewed-on: https://chromium-review.googlesource.com/817918
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
aabe380890
commit
0f98c3c1df
@ -268,6 +268,7 @@ extern "C" {
|
|||||||
// TODO(fbarchard): Port to Visual C
|
// TODO(fbarchard): Port to Visual C
|
||||||
#if !defined(LIBYUV_DISABLE_X86) && \
|
#if !defined(LIBYUV_DISABLE_X86) && \
|
||||||
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
|
||||||
|
#define HAS_ARGBTOAR30ROW_SSE2
|
||||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||||
#define HAS_MERGERGBROW_SSSE3
|
#define HAS_MERGERGBROW_SSSE3
|
||||||
#define HAS_SPLITRGBROW_SSSE3
|
#define HAS_SPLITRGBROW_SSSE3
|
||||||
@ -1795,6 +1796,7 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int width);
|
|||||||
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
||||||
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
||||||
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
||||||
|
void ARGBToAR30Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
||||||
|
|
||||||
void ARGBToRGB565DitherRow_C(const uint8* src_argb,
|
void ARGBToRGB565DitherRow_C(const uint8* src_argb,
|
||||||
uint8* dst_rgb,
|
uint8* dst_rgb,
|
||||||
@ -2422,6 +2424,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb,
|
|||||||
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
|
void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb,
|
||||||
uint8* dst_rgb,
|
uint8* dst_rgb,
|
||||||
int width);
|
int width);
|
||||||
|
void ARGBToAR30Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int width);
|
||||||
|
|
||||||
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
|
void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb,
|
||||||
uint8* dst_rgb,
|
uint8* dst_rgb,
|
||||||
|
|||||||
@ -49,26 +49,25 @@ extern "C" {
|
|||||||
// Secondary formats are converted in 2 steps.
|
// Secondary formats are converted in 2 steps.
|
||||||
// Auxiliary formats call primary converters.
|
// Auxiliary formats call primary converters.
|
||||||
enum FourCC {
|
enum FourCC {
|
||||||
// 8 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
|
// 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
|
||||||
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
|
FOURCC_I420 = FOURCC('I', '4', '2', '0'),
|
||||||
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
|
FOURCC_I422 = FOURCC('I', '4', '2', '2'),
|
||||||
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
|
FOURCC_I444 = FOURCC('I', '4', '4', '4'),
|
||||||
FOURCC_I411 = FOURCC('I', '4', '1', '1'), // deprecated.
|
|
||||||
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
|
FOURCC_I400 = FOURCC('I', '4', '0', '0'),
|
||||||
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
|
FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
|
||||||
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
|
FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
|
||||||
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
|
FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
|
||||||
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
|
FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
|
||||||
|
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
|
||||||
|
|
||||||
// 1 Secondary YUV format: row biplanar.
|
// 1 Secondary YUV format: row biplanar.
|
||||||
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
|
FOURCC_M420 = FOURCC('M', '4', '2', '0'),
|
||||||
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'), // deprecated.
|
|
||||||
|
|
||||||
// 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
|
// 10 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc
|
||||||
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
|
FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
|
||||||
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
|
FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
|
||||||
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
|
FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
|
||||||
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'),
|
FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010.
|
||||||
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
|
FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
|
||||||
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
|
FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
|
||||||
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
|
FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
|
||||||
@ -76,16 +75,10 @@ enum FourCC {
|
|||||||
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
|
FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
|
||||||
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
|
FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
|
||||||
|
|
||||||
// 4 Secondary RGB formats: 4 Bayer Patterns. deprecated.
|
|
||||||
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
|
|
||||||
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
|
|
||||||
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
|
|
||||||
FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
|
|
||||||
|
|
||||||
// 1 Primary Compressed YUV format.
|
// 1 Primary Compressed YUV format.
|
||||||
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
|
FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
|
||||||
|
|
||||||
// 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
|
// 7 Auxiliary YUV variations: 3 with U and V planes swapped, 1 Alias.
|
||||||
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
|
FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
|
||||||
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
|
FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
|
||||||
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
|
FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
|
||||||
@ -93,7 +86,6 @@ enum FourCC {
|
|||||||
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
|
FOURCC_J420 = FOURCC('J', '4', '2', '0'),
|
||||||
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
|
FOURCC_J400 = FOURCC('J', '4', '0', '0'), // unofficial fourcc
|
||||||
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
|
FOURCC_H420 = FOURCC('H', '4', '2', '0'), // unofficial fourcc
|
||||||
FOURCC_H010 = FOURCC('H', '0', '1', '0'), // unofficial fourcc. 10 bit lsb
|
|
||||||
|
|
||||||
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
|
// 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
|
||||||
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
|
FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
|
||||||
@ -114,7 +106,13 @@ enum FourCC {
|
|||||||
FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
|
FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
|
||||||
FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
|
FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
|
||||||
|
|
||||||
// 1 Auxiliary compressed YUV format set aside for capturer.
|
// deprecated formats. Not supported, but defined for backward compatibility.
|
||||||
|
FOURCC_I411 = FOURCC('I', '4', '1', '1'),
|
||||||
|
FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
|
||||||
|
FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
|
||||||
|
FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
|
||||||
|
FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
|
||||||
|
FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
|
||||||
FOURCC_H264 = FOURCC('H', '2', '6', '4'),
|
FOURCC_H264 = FOURCC('H', '2', '6', '4'),
|
||||||
|
|
||||||
// Match any fourcc.
|
// Match any fourcc.
|
||||||
|
|||||||
@ -478,6 +478,14 @@ static int H010ToAR30Matrix(const uint16* src_y,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBTOAR30ROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_Any_SSE2;
|
||||||
|
if (IS_ALIGNED(width, 4)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_SSE2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||||
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
|
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
|
||||||
|
|||||||
@ -1333,6 +1333,14 @@ int ARGBToAR30(const uint8* src_argb,
|
|||||||
height = 1;
|
height = 1;
|
||||||
src_stride_argb = dst_stride_ar30 = 0;
|
src_stride_argb = dst_stride_ar30 = 0;
|
||||||
}
|
}
|
||||||
|
#if defined(HAS_ARGBTOAR30ROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_Any_SSE2;
|
||||||
|
if (IS_ALIGNED(width, 4)) {
|
||||||
|
ARGBToAR30Row = ARGBToAR30Row_SSE2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
||||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||||
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
|
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
|
||||||
|
|||||||
@ -396,6 +396,9 @@ ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
|
|||||||
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
|
ANY11(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, 0, 4, 2, 7)
|
||||||
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
|
ANY11(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, 0, 4, 2, 7)
|
||||||
#endif
|
#endif
|
||||||
|
#if defined(HAS_ARGBTOAR30ROW_SSE2)
|
||||||
|
ANY11(ARGBToAR30Row_Any_SSE2, ARGBToAR30Row_SSE2, 0, 4, 4, 3)
|
||||||
|
#endif
|
||||||
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
||||||
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
|
ANY11(ARGBToAR30Row_Any_AVX2, ARGBToAR30Row_AVX2, 0, 4, 4, 7)
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -700,6 +700,59 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
|
|||||||
}
|
}
|
||||||
#endif // HAS_RGB24TOARGBROW_SSSE3
|
#endif // HAS_RGB24TOARGBROW_SSSE3
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBTOAR30ROW_SSE2
// Converts a row of 32 bit ARGB pixels to AR30 (2 bit alpha, 10 bit red,
// green and blue packed into 32 bits), 4 pixels (16 bytes) per iteration.
//
// Each 8 bit color channel c is widened to 10 bits as (c << 2) | (c >> 6),
// i.e. its top 2 bits are replicated into the low 2 bits of the result.
// Alpha keeps only its top 2 bits, which stay in bits 30..31.
//
// Resulting bit layout of each 32 bit output pixel:
//   bits 30..31 alpha, 20..29 red, 10..19 green, 0..9 blue.
//
// src:   ARGB input; loaded with movdqu, so no alignment is required.
// dst:   AR30 output; stored with movdqu, so no alignment is required.
// width: number of pixels. The loop body runs before the counter is tested
//        and always handles a whole group of 4 pixels, so width must be a
//        positive multiple of 4 or the row is over-read and over-written.
//        Callers in this change pick this function only when
//        IS_ALIGNED(width, 4) and use ARGBToAR30Row_Any_SSE2 otherwise.
void ARGBToAR30Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile(
      // Build the constant masks once, outside the loop.
      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x000000ff mask
      "psrld $0x18,%%xmm4 \n"
      "pcmpeqb %%xmm5,%%xmm5 \n"  // 0xc0000000 mask
      "pslld $30,%%xmm5 \n"

      LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"  // load 4 ARGB pixels
      // alpha: keep the top 2 bits in place; xmm3 accumulates the result.
      "movdqa %%xmm0,%%xmm3 \n"
      "pand %%xmm5,%%xmm3 \n"
      // red: isolate bits 16..23, then place (r << 2) | (r >> 6) at bit 20.
      "movdqa %%xmm0,%%xmm1 \n"
      "psrld $0x10,%%xmm1 \n"
      "pand %%xmm4,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "psrld $0x6,%%xmm2 \n"
      "pslld $22,%%xmm1 \n"
      "pslld $20,%%xmm2 \n"
      "por %%xmm1,%%xmm3 \n"
      "por %%xmm2,%%xmm3 \n"
      // green: isolate bits 8..15, then place (g << 2) | (g >> 6) at bit 10.
      "movdqa %%xmm0,%%xmm1 \n"
      "psrld $0x08,%%xmm1 \n"
      "pand %%xmm4,%%xmm1 \n"
      "movdqa %%xmm1,%%xmm2 \n"
      "psrld $0x6,%%xmm2 \n"
      "pslld $12,%%xmm1 \n"
      "pslld $10,%%xmm2 \n"
      "por %%xmm1,%%xmm3 \n"
      "por %%xmm2,%%xmm3 \n"
      // blue: isolate bits 0..7, then place (b << 2) | (b >> 6) at bit 0.
      "pand %%xmm4,%%xmm0 \n"
      "movdqa %%xmm0,%%xmm1 \n"
      "psrld $0x6,%%xmm1 \n"
      "pslld $2,%%xmm0 \n"
      "por %%xmm0,%%xmm3 \n"
      "por %%xmm1,%%xmm3 \n"

      "movdqu %%xmm3,(%1) \n"  // store 4 AR30 pixels
      "add $0x10,%0 \n"
      "add $0x10,%1 \n"
      "sub $0x4,%2 \n"  // 4 pixels consumed
      "jg 1b \n"
      : "+r"(src),   // %0
        "+r"(dst),   // %1
        "+r"(width)  // %2
      :
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif  // HAS_ARGBTOAR30ROW_SSE2
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOAR30ROW_AVX2
|
#ifdef HAS_ARGBTOAR30ROW_AVX2
|
||||||
void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
|
void ARGBToAR30Row_AVX2(const uint8* src, uint8* dst, int width) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
|
|||||||
@ -1948,14 +1948,16 @@ TEST_F(LibYUVConvertTest, ARGBToAR30Row_Opt) {
|
|||||||
ARGBToAR30Row_C(src, dst_c, kPixels);
|
ARGBToAR30Row_C(src, dst_c, kPixels);
|
||||||
|
|
||||||
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
|
int has_sse2 = TestCpuFlag(kCpuHasSSE2);
|
||||||
for (int i = 0; i < benchmark_iterations_; ++i) {
|
for (int i = 0; i < benchmark_iterations_; ++i) {
|
||||||
if (has_avx2) {
|
if (has_avx2) {
|
||||||
ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
|
ARGBToAR30Row_AVX2(src, dst_opt, kPixels);
|
||||||
|
} else if (has_sse2) {
|
||||||
|
ARGBToAR30Row_SSE2(src, dst_opt, kPixels);
|
||||||
} else {
|
} else {
|
||||||
ARGBToAR30Row_C(src, dst_opt, kPixels);
|
ARGBToAR30Row_C(src, dst_opt, kPixels);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int i = 0; i < kPixels * 4; ++i) {
|
for (int i = 0; i < kPixels * 4; ++i) {
|
||||||
EXPECT_EQ(dst_opt[i], dst_c[i]);
|
EXPECT_EQ(dst_opt[i], dst_c[i]);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user