ARGBToRGB24_AVX2 version

AVX2 port of the SSSE3 ARGB row conversions that output 24-bit RGB (RGB24 and RAW).

Bug: libyuv:778
Test: LibYUVConvertTest.NV21ToRGB24_Opt
Change-Id: I14f7815522d1b790ecd2bb39d9a3441e803b694a
Reviewed-on: https://chromium-review.googlesource.com/953303
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2018-03-07 16:06:27 -08:00 committed by Commit Bot
parent 3009890c11
commit 1d509f2178
6 changed files with 187 additions and 35 deletions

View File

@ -275,10 +275,12 @@ extern "C" {
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
@ -1701,6 +1703,9 @@ void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
// AVX2 row functions that read 4-byte ARGB pixels from src and write 3-byte
// pixels to dst. width is the pixel count; the AVX2 implementations consume
// 32 pixels per loop pass, so callers are expected to pass a multiple of 32
// (the *_Any_AVX2 wrappers exist for other widths).
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
uint8_t* dst_rgb,
const uint32_t dither4,
@ -2492,7 +2497,12 @@ void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
// "Any" wrappers around the AVX2 ARGB -> RAW / RGB24 row functions. They are
// generated by the ANY11 macro and are the variants selected when width is
// not a multiple of 32.
// NOTE(review): remainder handling lives in ANY11, which is defined
// elsewhere — confirm details there.
void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int width);
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
const uint32_t param,

View File

@ -879,6 +879,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORGB24ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
@ -937,6 +945,14 @@ int ARGBToRAW(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTORAWROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToRAWRow = ARGBToRAWRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTORAWROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;

View File

@ -127,22 +127,23 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
const uint8_t* v_buf, uint8_t* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(uint8_t temp[64 * 4]); \
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
SIMD_ALIGNED(uint8_t temp[128 * 4]); \
memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r); \
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
if (width & 1) { \
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
} \
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
SS(r, DUVSHIFT) * BPP); \
}
@ -161,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TORGB24ROW_AVX2
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
#endif
#ifdef HAS_I422TOARGBROW_AVX2
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
@ -443,6 +444,12 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
#endif
// Instantiate the any-width wrappers for the AVX2 ARGB -> RGB24 / RAW rows.
// The arguments 4 and 3 match the kernels' source and destination bytes per
// pixel, and 31 matches their 32-pixel loop step.
// NOTE(review): the meaning of each ANY11 argument is taken from how the
// kernels behave, not from the macro definition (not visible here) — confirm.
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
#endif
#if defined(HAS_ARGBTORAWROW_AVX2)
ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
#endif
#if defined(HAS_ARGBTORGB565ROW_AVX2)
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
#endif

View File

@ -3004,8 +3004,11 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_uv += twidth;
dst_rgb24 += twidth * 3;
@ -3025,8 +3028,11 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_vu += twidth;
dst_rgb24 += twidth * 3;
@ -3124,8 +3130,11 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
// TODO(fbarchard): ARGBToRGB24Row_AVX2
#if defined(HAS_ARGBTORGB24ROW_AVX2)
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
#else
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
#endif
src_y += twidth;
src_u += twidth / 2;
src_v += twidth / 2;

View File

@ -505,6 +505,97 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
// Dword index table for vpermd, used to pack 12+12 bytes into 24.
// Dwords 0-2 (low 128-bit lane) and 4-6 (high lane) are moved to the front,
// giving 24 contiguous bytes; dwords 3 and 7 end up in the top 8 bytes.
// Loaded with vmovdqa, so the object must be 32-byte aligned
// (presumably guaranteed by the lvec32 type — confirm).
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
// Converts a row of 4-byte ARGB pixels at src into 3-byte pixels at dst
// using AVX2.
//
// Each loop pass loads 128 bytes (32 pixels), stores 96 bytes and subtracts
// 32 from width. The loop body always runs at least once and repeats while
// width is still positive, so width should be a positive multiple of 32;
// any other value makes the final pass read and write a full 32-pixel group.
//
// The byte order of each output pixel is set by kShuffleMaskARGBToRGB24,
// which is defined elsewhere.
// NOTE(review): the vpor merges below rely on that mask zeroing the 4th
// dword of every 128-bit lane — confirm against the mask definition.
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n" // 16-byte shuffle mask copied to both lanes
"vmovdqa %4,%%ymm7 \n" // vpermd dword indices (aligned load)
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // load 4 x 32 bytes = 32 source pixels
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"lea 0x80(%0),%0 \n" // src += 128
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // per lane: 12 data bytes + 4 pad (xxx0yyy0)
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes, pad moves to top qword
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8: ymm1 qword 0 -> top qword
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n" // output bytes 0..31
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16: ymm1 qwords 1,2 -> low half
"vpermq $0x4f,%%ymm2,%%ymm4 \n" // ymm2 qwords 0,1 -> high half
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm1,0x20(%1) \n" // output bytes 32..63
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24: ymm2 qword 2 -> low qword
"vpermq $0x93,%%ymm3,%%ymm3 \n" // ymm3 qwords 0..2 -> qwords 1..3
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm2,0x40(%1) \n" // output bytes 64..95
"lea 0x60(%1),%1 \n" // dst += 96
"sub $0x20,%2 \n" // width -= 32
"jg 1b \n" // loop while width > 0
"vzeroupper \n" // clear upper ymm halves before returning
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRGB24), // %3
"m"(kPermdRGB24_AVX) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
// Converts a row of 4-byte ARGB pixels at src into 3-byte RAW pixels at dst
// using AVX2. The instruction sequence is the same as ARGBToRGB24Row_AVX2;
// only the byte shuffle mask (kShuffleMaskARGBToRAW, defined elsewhere)
// differs, and that mask sets the byte order of each output pixel.
//
// Each loop pass loads 128 bytes (32 pixels), stores 96 bytes and subtracts
// 32 from width. The loop body always runs at least once and repeats while
// width is still positive, so width should be a positive multiple of 32.
// NOTE(review): the vpor merges below rely on the mask zeroing the 4th
// dword of every 128-bit lane — confirm against the mask definition.
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm6 \n" // 16-byte shuffle mask copied to both lanes
"vmovdqa %4,%%ymm7 \n" // vpermd dword indices (aligned load)
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // load 4 x 32 bytes = 32 source pixels
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"lea 0x80(%0),%0 \n" // src += 128
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // per lane: 12 data bytes + 4 pad (xxx0yyy0)
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes, pad moves to top qword
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8: ymm1 qword 0 -> top qword
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n" // output bytes 0..31
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16: ymm1 qwords 1,2 -> low half
"vpermq $0x4f,%%ymm2,%%ymm4 \n" // ymm2 qwords 0,1 -> high half
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
"vmovdqu %%ymm1,0x20(%1) \n" // output bytes 32..63
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24: ymm2 qword 2 -> low qword
"vpermq $0x93,%%ymm3,%%ymm3 \n" // ymm3 qwords 0..2 -> qwords 1..3
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm2,0x40(%1) \n" // output bytes 64..95
"lea 0x60(%1),%1 \n" // dst += 96
"sub $0x20,%2 \n" // width -= 32
"jg 1b \n" // loop while width > 0
"vzeroupper \n" // clear upper ymm halves before returning
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "m"(kShuffleMaskARGBToRAW), // %3
"m"(kPermdRGB24_AVX) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
"pcmpeqb %%xmm3,%%xmm3 \n"

View File

@ -1022,15 +1022,9 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
MaskCpuFlags(benchmark_cpu_info_); \
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
kHeight); \
int max_diff = 0; \
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
static_cast<int>(dst_argb_opt[i])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
} \
} \
EXPECT_LE(max_diff, DIFF); \
free_aligned_buffer_page_end(src_argb); \
free_aligned_buffer_page_end(dst_argb_c); \
free_aligned_buffer_page_end(dst_argb_opt); \
@ -1050,6 +1044,7 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
HEIGHT_B, DIFF)
// TODO(fbarchard): make ARM version of C code that matches NEON.
TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
@ -2458,4 +2453,28 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
free_aligned_buffer_page_end(ar30_pixels);
}
// Round trip: RGB24 -> ARGB -> RGB24 must reproduce the original bytes.
TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
  const int kSize = 256;  // pixels in the single test row
  const int kRgb24Bytes = kSize * 3;
  const int kArgbBytes = kSize * 4;

  align_buffer_page_end(orig_rgb24, kRgb24Bytes);
  align_buffer_page_end(argb_pixels, kArgbBytes);
  align_buffer_page_end(dest_rgb24, kRgb24Bytes);

  // Fill the source with a byte ramp; the value wraps modulo 256.
  for (int pos = 0; pos < kRgb24Bytes; ++pos) {
    orig_rgb24[pos] = static_cast<uint8_t>(pos);
  }

  // One row only, so the strides (passed as 0) are never used to advance.
  RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
  ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);

  // Every byte must survive the round trip unchanged.
  for (int i = 0; i < kRgb24Bytes; ++i) {
    EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
  }

  free_aligned_buffer_page_end(orig_rgb24);
  free_aligned_buffer_page_end(argb_pixels);
  free_aligned_buffer_page_end(dest_rgb24);
}
} // namespace libyuv