mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
ARGBToRGB24_AVX2 version
AVX2 port of the SSSE3 conversion to output 24-bit RGB.

Bug: libyuv:778
Test: LibYUVConvertTest.NV21ToRGB24_Opt
Change-Id: I14f7815522d1b790ecd2bb39d9a3441e803b694a
Reviewed-on: https://chromium-review.googlesource.com/953303
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
3009890c11
commit
1d509f2178
@ -275,10 +275,12 @@ extern "C" {
|
||||
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
|
||||
#define HAS_ABGRTOAR30ROW_AVX2
|
||||
#define HAS_ARGBTOAR30ROW_AVX2
|
||||
#define HAS_ARGBTORAWROW_AVX2
|
||||
#define HAS_ARGBTORGB24ROW_AVX2
|
||||
#define HAS_CONVERT16TO8ROW_AVX2
|
||||
#define HAS_CONVERT8TO16ROW_AVX2
|
||||
#define HAS_I210TOARGBROW_AVX2
|
||||
#define HAS_I210TOAR30ROW_AVX2
|
||||
#define HAS_I210TOARGBROW_AVX2
|
||||
#define HAS_I422TOAR30ROW_AVX2
|
||||
#define HAS_I422TOUYVYROW_AVX2
|
||||
#define HAS_I422TOYUY2ROW_AVX2
|
||||
@ -1701,6 +1703,9 @@ void ARGBToARGB4444Row_SSE2(const uint8_t* src, uint8_t* dst, int width);
|
||||
void ABGRToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
|
||||
void ARGBToAR30Row_SSSE3(const uint8_t* src, uint8_t* dst, int width);
|
||||
|
||||
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
||||
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width);
|
||||
|
||||
void ARGBToRGB565DitherRow_C(const uint8_t* src_argb,
|
||||
uint8_t* dst_rgb,
|
||||
const uint32_t dither4,
|
||||
@ -2492,7 +2497,12 @@ void ABGRToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
|
||||
void ARGBToAR30Row_Any_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
|
||||
void ARGBToRAWRow_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void ARGBToRGB24Row_Any_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
void ARGBToRGB565DitherRow_Any_SSE2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
const uint32_t param,
|
||||
|
||||
@ -879,6 +879,14 @@ int ARGBToRGB24(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToRGB24Row = ARGBToRGB24Row_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToRGB24Row = ARGBToRGB24Row_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB24ROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
|
||||
@ -937,6 +945,14 @@ int ARGBToRAW(const uint8_t* src_argb,
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORAWROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToRAWRow = ARGBToRAWRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToRAWRow = ARGBToRAWRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORAWROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
|
||||
|
||||
@ -127,22 +127,23 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
|
||||
void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \
|
||||
const uint8_t* v_buf, uint8_t* dst_ptr, \
|
||||
const struct YuvConstants* yuvconstants, int width) { \
|
||||
SIMD_ALIGNED(uint8_t temp[64 * 4]); \
|
||||
memset(temp, 0, 64 * 3); /* for YUY2 and msan */ \
|
||||
SIMD_ALIGNED(uint8_t temp[128 * 4]); \
|
||||
memset(temp, 0, 128 * 3); /* for YUY2 and msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
|
||||
} \
|
||||
memcpy(temp, y_buf + n, r); \
|
||||
memcpy(temp + 64, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 128, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 128, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
memcpy(temp + 256, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT)); \
|
||||
if (width & 1) { \
|
||||
temp[64 + SS(r, UVSHIFT)] = temp[64 + SS(r, UVSHIFT) - 1]; \
|
||||
temp[128 + SS(r, UVSHIFT)] = temp[128 + SS(r, UVSHIFT) - 1]; \
|
||||
temp[256 + SS(r, UVSHIFT)] = temp[256 + SS(r, UVSHIFT) - 1]; \
|
||||
} \
|
||||
ANY_SIMD(temp, temp + 64, temp + 128, temp + 192, yuvconstants, MASK + 1); \
|
||||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 192, \
|
||||
ANY_SIMD(temp, temp + 128, temp + 256, temp + 384, yuvconstants, \
|
||||
MASK + 1); \
|
||||
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, temp + 384, \
|
||||
SS(r, DUVSHIFT) * BPP); \
|
||||
}
|
||||
|
||||
@ -161,10 +162,10 @@ ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
|
||||
ANY31C(I422ToARGB4444Row_Any_SSSE3, I422ToARGB4444Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToARGB1555Row_Any_SSSE3, I422ToARGB1555Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB565Row_Any_SSSE3, I422ToRGB565Row_SSSE3, 1, 0, 2, 7)
|
||||
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 7)
|
||||
ANY31C(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, 1, 0, 3, 15)
|
||||
#endif // HAS_I444TOARGBROW_SSSE3
|
||||
#ifdef HAS_I422TORGB24ROW_AVX2
|
||||
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 15)
|
||||
ANY31C(I422ToRGB24Row_Any_AVX2, I422ToRGB24Row_AVX2, 1, 0, 3, 31)
|
||||
#endif
|
||||
#ifdef HAS_I422TOARGBROW_AVX2
|
||||
ANY31C(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, 1, 0, 4, 15)
|
||||
@ -443,6 +444,12 @@ ANY11(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 0, 4, 2, 3)
|
||||
ANY11(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 0, 4, 2, 3)
|
||||
ANY11(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 0, 4, 2, 3)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||
ANY11(ARGBToRGB24Row_Any_AVX2, ARGBToRGB24Row_AVX2, 0, 4, 3, 31)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORAWROW_AVX2)
|
||||
ANY11(ARGBToRAWRow_Any_AVX2, ARGBToRAWRow_AVX2, 0, 4, 3, 31)
|
||||
#endif
|
||||
#if defined(HAS_ARGBTORGB565ROW_AVX2)
|
||||
ANY11(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, 0, 4, 2, 7)
|
||||
#endif
|
||||
|
||||
@ -3004,8 +3004,11 @@ void NV12ToRGB24Row_AVX2(const uint8_t* src_y,
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
NV12ToARGBRow_AVX2(src_y, src_uv, row, yuvconstants, twidth);
|
||||
// TODO(fbarchard): ARGBToRGB24Row_AVX2
|
||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
|
||||
#else
|
||||
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
|
||||
#endif
|
||||
src_y += twidth;
|
||||
src_uv += twidth;
|
||||
dst_rgb24 += twidth * 3;
|
||||
@ -3025,8 +3028,11 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
NV21ToARGBRow_AVX2(src_y, src_vu, row, yuvconstants, twidth);
|
||||
// TODO(fbarchard): ARGBToRGB24Row_AVX2
|
||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
|
||||
#else
|
||||
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
|
||||
#endif
|
||||
src_y += twidth;
|
||||
src_vu += twidth;
|
||||
dst_rgb24 += twidth * 3;
|
||||
@ -3124,8 +3130,11 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
I422ToARGBRow_AVX2(src_y, src_u, src_v, row, yuvconstants, twidth);
|
||||
// TODO(fbarchard): ARGBToRGB24Row_AVX2
|
||||
#if defined(HAS_ARGBTORGB24ROW_AVX2)
|
||||
ARGBToRGB24Row_AVX2(row, dst_rgb24, twidth);
|
||||
#else
|
||||
ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
|
||||
#endif
|
||||
src_y += twidth;
|
||||
src_u += twidth / 2;
|
||||
src_v += twidth / 2;
|
||||
|
||||
@ -505,6 +505,97 @@ void ARGBToRAWRow_SSSE3(const uint8_t* src, uint8_t* dst, int width) {
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
|
||||
// vpermd dword indices for packing 12+12 bytes to 24: destination dwords 0-5
// take source dwords 0,1,2 (low 128-bit lane) and 4,5,6 (high 128-bit lane);
// source dwords 3 and 7 are moved to the last 8 bytes of the register.
// Loaded into ymm7 by both ARGBToRGB24Row_AVX2 and ARGBToRAWRow_AVX2.
|
||||
static const lvec32 kPermdRGB24_AVX = {0, 1, 2, 4, 5, 6, 3, 7};
|
||||
|
||||
// AVX2 row conversion from 4-byte ARGB pixels at src to 3-byte pixels at dst.
//
// Each loop iteration loads 128 bytes from src (32 pixels), byte-shuffles every
// 128-bit lane with kShuffleMaskARGBToRGB24, packs the lanes with
// kPermdRGB24_AVX, and stores 96 bytes to dst. width is then reduced by 32 and
// the loop repeats while the result is still positive.
//
// The loop body runs before width is tested, so a full group of 32 pixels is
// always read and written, even when width is not a multiple of 32.
// NOTE(review): kShuffleMaskARGBToRGB24 is defined outside this view; the
// vpor merges below only work if it zeroes the top 4 bytes of each 128-bit
// lane - confirm against its definition.
void ARGBToRGB24Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm6 \n" // same 16-byte shuffle mask in both lanes
|
||||
"vmovdqa %4,%%ymm7 \n" // aligned load: %4 must be 32-byte aligned
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n" // 4 x 32 bytes = 32 source pixels
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
"vmovdqu 0x40(%0),%%ymm2 \n"
|
||||
"vmovdqu 0x60(%0),%%ymm3 \n"
|
||||
"lea 0x80(%0),%0 \n" // src += 128
|
||||
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0: 12 packed bytes + 4 spare per lane
|
||||
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
|
||||
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
|
||||
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
|
||||
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes; spare dwords go to qword 3
|
||||
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
|
||||
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
|
||||
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
|
||||
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8: ymm1 qword 0 moved to qword 3
|
||||
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
|
||||
"vmovdqu %%ymm0,(%1) \n" // output bytes 0..31
|
||||
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16: ymm1 qwords 1,2 to qwords 0,1
|
||||
"vpermq $0x4f,%%ymm2,%%ymm4 \n" // ymm2 qwords 0,1 to qwords 2,3
|
||||
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm1,0x20(%1) \n" // output bytes 32..63
|
||||
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24: ymm2 qword 2 to qword 0
|
||||
"vpermq $0x93,%%ymm3,%%ymm3 \n" // ymm3 qwords 0,1,2 to qwords 1,2,3
|
||||
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu %%ymm2,0x40(%1) \n" // output bytes 64..95
|
||||
"lea 0x60(%1),%1 \n" // dst += 96
|
||||
"sub $0x20,%2 \n" // width -= 32
|
||||
"jg 1b \n" // loop while width > 0
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kShuffleMaskARGBToRGB24), // %3
|
||||
"m"(kPermdRGB24_AVX) // %4
|
||||
// xmm5 is listed as clobbered although the body above does not reference it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
// AVX2 row conversion from 4-byte ARGB pixels at src to 3-byte pixels at dst.
//
// The instruction sequence is the same as ARGBToRGB24Row_AVX2; only the byte
// shuffle mask differs (kShuffleMaskARGBToRAW, defined outside this view).
// Each loop iteration loads 128 bytes from src (32 pixels), stores 96 bytes to
// dst, and reduces width by 32, repeating while the result is still positive.
//
// The loop body runs before width is tested, so a full group of 32 pixels is
// always read and written, even when width is not a multiple of 32.
// NOTE(review): the vpor merges below only work if the shuffle mask zeroes the
// top 4 bytes of each 128-bit lane - confirm against the mask definition.
void ARGBToRAWRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %3,%%ymm6 \n" // same 16-byte shuffle mask in both lanes
|
||||
"vmovdqa %4,%%ymm7 \n" // aligned load: %4 must be 32-byte aligned
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%ymm0 \n" // 4 x 32 bytes = 32 source pixels
|
||||
"vmovdqu 0x20(%0),%%ymm1 \n"
|
||||
"vmovdqu 0x40(%0),%%ymm2 \n"
|
||||
"vmovdqu 0x60(%0),%%ymm3 \n"
|
||||
"lea 0x80(%0),%0 \n" // src += 128
|
||||
"vpshufb %%ymm6,%%ymm0,%%ymm0 \n" // xxx0yyy0: 12 packed bytes + 4 spare per lane
|
||||
"vpshufb %%ymm6,%%ymm1,%%ymm1 \n"
|
||||
"vpshufb %%ymm6,%%ymm2,%%ymm2 \n"
|
||||
"vpshufb %%ymm6,%%ymm3,%%ymm3 \n"
|
||||
"vpermd %%ymm0,%%ymm7,%%ymm0 \n" // pack to 24 bytes; spare dwords go to qword 3
|
||||
"vpermd %%ymm1,%%ymm7,%%ymm1 \n"
|
||||
"vpermd %%ymm2,%%ymm7,%%ymm2 \n"
|
||||
"vpermd %%ymm3,%%ymm7,%%ymm3 \n"
|
||||
"vpermq $0x3f,%%ymm1,%%ymm4 \n" // combine 24 + 8: ymm1 qword 0 moved to qword 3
|
||||
"vpor %%ymm4,%%ymm0,%%ymm0 \n"
|
||||
"vmovdqu %%ymm0,(%1) \n" // output bytes 0..31
|
||||
"vpermq $0xf9,%%ymm1,%%ymm1 \n" // combine 16 + 16: ymm1 qwords 1,2 to qwords 0,1
|
||||
"vpermq $0x4f,%%ymm2,%%ymm4 \n" // ymm2 qwords 0,1 to qwords 2,3
|
||||
"vpor %%ymm4,%%ymm1,%%ymm1 \n"
|
||||
"vmovdqu %%ymm1,0x20(%1) \n" // output bytes 32..63
|
||||
"vpermq $0xfe,%%ymm2,%%ymm2 \n" // combine 8 + 24: ymm2 qword 2 to qword 0
|
||||
"vpermq $0x93,%%ymm3,%%ymm3 \n" // ymm3 qwords 0,1,2 to qwords 1,2,3
|
||||
"vpor %%ymm3,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu %%ymm2,0x40(%1) \n" // output bytes 64..95
|
||||
"lea 0x60(%1),%1 \n" // dst += 96
|
||||
"sub $0x20,%2 \n" // width -= 32
|
||||
"jg 1b \n" // loop while width > 0
|
||||
"vzeroupper \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "m"(kShuffleMaskARGBToRAW), // %3
|
||||
"m"(kPermdRGB24_AVX) // %4
|
||||
// xmm5 is listed as clobbered although the body above does not reference it.
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
void ARGBToRGB565Row_SSE2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm3,%%xmm3 \n"
|
||||
|
||||
@ -1022,15 +1022,9 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
|
||||
MaskCpuFlags(benchmark_cpu_info_); \
|
||||
FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \
|
||||
kHeight); \
|
||||
int max_diff = 0; \
|
||||
for (int i = 0; i < kStrideB * kHeightB; ++i) { \
|
||||
int abs_diff = abs(static_cast<int>(dst_argb_c[i]) - \
|
||||
static_cast<int>(dst_argb_opt[i])); \
|
||||
if (abs_diff > max_diff) { \
|
||||
max_diff = abs_diff; \
|
||||
EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \
|
||||
} \
|
||||
} \
|
||||
EXPECT_LE(max_diff, DIFF); \
|
||||
free_aligned_buffer_page_end(src_argb); \
|
||||
free_aligned_buffer_page_end(dst_argb_c); \
|
||||
free_aligned_buffer_page_end(dst_argb_opt); \
|
||||
@ -1050,6 +1044,7 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
|
||||
TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
|
||||
HEIGHT_B, DIFF)
|
||||
|
||||
// TODO(fbarchard): make ARM version of C code that matches NEON.
|
||||
TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0)
|
||||
TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1, 0)
|
||||
TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0)
|
||||
@ -2458,4 +2453,28 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
|
||||
free_aligned_buffer_page_end(ar30_pixels);
|
||||
}
|
||||
|
||||
// Round-trip test: convert RGB24 to ARGB and back to RGB24, then require the
// result to equal the original buffer byte for byte.
|
||||
TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
|
||||
const int kSize = 256;  // pixels in the single test row
|
||||
align_buffer_page_end(orig_rgb24, kSize * 3);
|
||||
align_buffer_page_end(argb_pixels, kSize * 4);
|
||||
align_buffer_page_end(dest_rgb24, kSize * 3);
|
||||
|
||||
// Fill the source with an incrementing byte pattern. Neighbouring bytes
// differ, so this is a ramp across channels rather than true grey pixels.
// NOTE(review): i runs to kSize * 3 - 1, so values presumably wrap modulo 256
// if the buffer elements are 8-bit - confirm against align_buffer_page_end.
|
||||
for (int i = 0; i < kSize * 3; ++i) {
|
||||
orig_rgb24[i] = i;
|
||||
}
|
||||
|
||||
// One row of kSize pixels (height 1); both strides are passed as 0.
RGB24ToARGB(orig_rgb24, 0, argb_pixels, 0, kSize, 1);
|
||||
ARGBToRGB24(argb_pixels, 0, dest_rgb24, 0, kSize, 1);
|
||||
|
||||
// Every byte must survive the round trip unchanged.
for (int i = 0; i < kSize * 3; ++i) {
|
||||
EXPECT_EQ(orig_rgb24[i], dest_rgb24[i]);
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(orig_rgb24);
|
||||
free_aligned_buffer_page_end(argb_pixels);
|
||||
free_aligned_buffer_page_end(dest_rgb24);
|
||||
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user