ARGBAttenuate AVX2 rewritten to match NEON/C code

Bug: 665
Change-Id: If26fb389dabbca870a0e720f5258d6c9b2cde156
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2196904
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
commit 84da59c168 (parent d13db1b437)
Author: Frank Barchard <fbarchard@chromium.org>
Date: 2020-05-12 20:37:43 -07:00 (committed by Commit Bot)
8 changed files with 214 additions and 189 deletions
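
Background for the rewrite: the NEON/C path rounds attenuation as (f * a + 128) >> 8, while the SSSE3/AVX2 code being replaced effectively computed ((a | a << 8) * (f | f << 8)) >> 24, which can differ from it by 1 (compare the ATTENUATE macros removed from row_common.cc below and the "off by 1" note added to row_win.cc). A minimal standalone sketch of the two roundings; the helper names are illustrative, not libyuv APIs:

#include <stdint.h>
#include <stdio.h>

/* New/NEON rounding: multiply, add half, shift. */
static uint8_t attenuate_c(uint8_t f, uint8_t a) {
  return (uint8_t)((f * a + 128) >> 8);
}

/* Old SSSE3/AVX2 rounding: replicate both operands to 16 bits, keep the high byte. */
static uint8_t attenuate_old_ssse3(uint8_t f, uint8_t a) {
  return (uint8_t)(((uint32_t)(a | (a << 8)) * (uint32_t)(f | (f << 8))) >> 24);
}

int main(void) {
  /* f = 255, a = 255: the new rounding gives 254, the old code gave 255. */
  printf("%d %d\n", attenuate_c(255, 255), attenuate_old_ssse3(255, 255));
  return 0;
}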

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1753
Version: 1754
License: BSD
License File: LICENSE

View File

@ -143,7 +143,6 @@ extern "C" {
// Effects:
#define HAS_ARGBADDROW_SSE2
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBCOLORTABLEROW_X86
@ -231,7 +230,6 @@ extern "C" {
// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMULTIPLYROW_AVX2
#define HAS_ARGBSUBTRACTROW_AVX2
#define HAS_ARGBUNATTENUATEROW_AVX2
@ -270,6 +268,7 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_ABGRTOAR30ROW_SSSE3
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
@ -285,7 +284,6 @@ extern "C" {
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@ -296,6 +294,7 @@ extern "C" {
#define HAS_ABGRTOAR30ROW_AVX2
#define HAS_ABGRTOUVROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBTOAR30ROW_AVX2
#define HAS_ARGBTORAWROW_AVX2
#define HAS_ARGBTORGB24ROW_AVX2

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1753
#define LIBYUV_VERSION 1754
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -763,11 +763,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
r3 = (r3 << 3) | (r3 >> 2);
#if LIBYUV_ARGBTOUV_PAVGB
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
@ -776,10 +776,10 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565,
dst_v[0] = RGB2xToV(r, g, b);
#endif
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
dst_v += 1;
src_rgb565 += 4;
next_rgb565 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8_t b0 = src_rgb565[0] & 0x1f;
@ -847,11 +847,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
r3 = (r3 << 3) | (r3 >> 2);
#if LIBYUV_ARGBTOUV_PAVGB
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
@ -860,10 +860,10 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555,
dst_v[0] = RGB2xToV(r, g, b);
#endif
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
dst_v += 1;
src_argb1555 += 4;
next_argb1555 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8_t b0 = src_argb1555[0] & 0x1f;
@ -931,11 +931,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
r3 = (r3 << 4) | r3;
#if LIBYUV_ARGBTOUV_PAVGB
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3));
uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3));
uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3));
dst_u[0] = RGBToU(ar, ag, ab);
dst_v[0] = RGBToV(ar, ag, ab);
#else
uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1;
uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1;
@ -944,10 +944,10 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444,
dst_v[0] = RGB2xToV(r, g, b);
#endif
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
dst_v += 1;
src_argb4444 += 4;
next_argb4444 += 4;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8_t b0 = src_argb4444[0] & 0x0f;
@ -2681,7 +2681,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
}
}
#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f
#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f)
// Blend src_argb0 over src_argb1 and store to dst_argb.
// dst_argb may be src_argb0 or src_argb1.
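
The added clamp255 guards against overflow when the foreground channel exceeds what a premultiplied input would allow, which happens with unattenuated foregrounds; the new ARGBBlend_Unattenuated test below exercises exactly that case. A per-channel arithmetic sketch, with an assumed clamp255 signature:

#include <stdint.h>

/* Saturate to [0, 255]; signature assumed for this sketch. */
static uint8_t clamp255(int v) {
  return (uint8_t)(v > 255 ? 255 : (v < 0 ? 0 : v));
}

/* One channel of BLEND: f = foreground, b = background, a = foreground alpha. */
static uint8_t blend_channel(int f, int b, int a) {
  return clamp255((((256 - a) * b) >> 8) + f);
}

/* Example: f = 200, b = 255, a = 1 (foreground not premultiplied).
   Unclamped: ((255 * 255) >> 8) + 200 = 254 + 200 = 454, which would wrap when
   stored into a uint8_t; with clamp255 the result saturates to 255. */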
@ -2757,12 +2757,7 @@ void BlendPlaneRow_C(const uint8_t* src0,
}
#undef UBLEND
#if defined(__aarch64__) || defined(__arm__)
#define ATTENUATE(f, a) (f * a + 128) >> 8
#else
// This code mimics the SSSE3 version for better testability.
#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24
#endif
#define ATTENUATE(f, a) (f * a + 128) >> 8
// Multiply source RGB by alpha and store to destination.
void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
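
With the platform split removed, the scalar path reduces to one per-pixel loop; roughly the following (a sketch of the unified behavior, not the verbatim libyuv source):

#include <stdint.h>

#define ATTENUATE(f, a) (((f) * (a) + 128) >> 8)

/* Multiply B, G, R by alpha with rounding; pass alpha through unchanged.
   Pixels are 4 bytes in memory order B, G, R, A. */
void ARGBAttenuateRow_C_sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width) {
  int i;
  for (i = 0; i < width; ++i) {
    const uint32_t b = src_argb[0];
    const uint32_t g = src_argb[1];
    const uint32_t r = src_argb[2];
    const uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)ATTENUATE(b, a);
    dst_argb[1] = (uint8_t)ATTENUATE(g, a);
    dst_argb[2] = (uint8_t)ATTENUATE(r, a);
    dst_argb[3] = (uint8_t)a;
    src_argb += 4;
    dst_argb += 4;
  }
}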

View File

@ -4892,94 +4892,99 @@ void BlendPlaneRow_AVX2(const uint8_t* src0,
#endif // HAS_BLENDPLANEROW_AVX2
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha
static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Shuffle table duplicating alpha.
static const uvec8 kAttenShuffle = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u,
14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
"pcmpeqb %%xmm3,%%xmm3 \n"
"pslld $0x18,%%xmm3 \n"
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
"movdqu %3,%%xmm6 \n" // alpha shuffler
"pcmpeqb %%xmm7,%%xmm7 \n" // 0x0080
"psllw $0xf,%%xmm7 \n"
"psrlw $0x8,%%xmm7 \n"
"pcmpeqb %%xmm0,%%xmm0 \n" // 0xff000000
"pslld $0x18,%%xmm0 \n"
"sub %0,%1 \n"
// 4 pixel loop.
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
"pshufb %%xmm4,%%xmm0 \n"
"movdqu (%0),%%xmm1 \n"
"punpcklbw %%xmm1,%%xmm1 \n"
"pmulhuw %%xmm1,%%xmm0 \n"
"movdqu (%0),%%xmm1 \n"
"pshufb %%xmm5,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"punpckhbw %%xmm2,%%xmm2 \n"
"pmulhuw %%xmm2,%%xmm1 \n"
"movdqu (%0),%%xmm2 \n"
"lea 0x10(%0),%0 \n"
"pand %%xmm3,%%xmm2 \n"
"psrlw $0x8,%%xmm0 \n"
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"por %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleAlpha0), // %3
"m"(kShuffleAlpha1) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
"1: \n"
"movdqu (%0),%%xmm1 \n"
"pxor %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm2 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpcklbw %%xmm4,%%xmm2 \n"
"punpckhbw %%xmm4,%%xmm3 \n"
"movdqa %%xmm2,%%xmm4 \n"
"movdqa %%xmm3,%%xmm5 \n"
"pshufb %%xmm6,%%xmm4 \n"
"pshufb %%xmm6,%%xmm5 \n"
"pmullw %%xmm4,%%xmm2 \n"
"pmullw %%xmm5,%%xmm3 \n"
"pand %%xmm0,%%xmm1 \n"
"paddw %%xmm7,%%xmm2 \n"
"paddw %%xmm7,%%xmm3 \n"
"psrlw $0x8,%%xmm2 \n"
"psrlw $0x8,%%xmm3 \n"
"packuswb %%xmm3,%%xmm2 \n"
"por %%xmm1,%%xmm2 \n"
"movdqu %%xmm2,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"sub $0x4,%2 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kAttenShuffle) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
#endif // HAS_ARGBATTENUATEROW_SSSE3
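
For readers who prefer intrinsics to inline asm: the new 4-pixel SSSE3 step widens each byte to 16 bits, broadcasts the alpha word over B/G/R with kAttenShuffle, multiplies, adds the 0x0080 rounding bias, shifts right by 8, repacks, and ORs the original alpha back in. A rough intrinsics rendering of one iteration (my own sketch; the committed code is the inline asm above):

#include <tmmintrin.h>  /* SSSE3 */

/* Sketch of one 4-pixel iteration of the loop above, written with intrinsics. */
static void AttenuateFourPixels_SSSE3_sketch(const uint8_t* src, uint8_t* dst) {
  const __m128i kZero = _mm_setzero_si128();
  const __m128i kHalf = _mm_set1_epi16(0x0080);               /* rounding bias */
  const __m128i kAlphaMask = _mm_set1_epi32((int)0xff000000); /* keep alpha */
  /* Same table as kAttenShuffle: copy the alpha word over B, G, R; zero A. */
  const __m128i kShuf = _mm_setr_epi8(6, 7, 6, 7, 6, 7, -128, -128,
                                      14, 15, 14, 15, 14, 15, -128, -128);
  __m128i argb = _mm_loadu_si128((const __m128i*)src);
  __m128i lo = _mm_unpacklo_epi8(argb, kZero);  /* 2 pixels as 16-bit lanes */
  __m128i hi = _mm_unpackhi_epi8(argb, kZero);
  __m128i alo = _mm_shuffle_epi8(lo, kShuf);    /* alpha broadcast, A lane = 0 */
  __m128i ahi = _mm_shuffle_epi8(hi, kShuf);
  lo = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(lo, alo), kHalf), 8);
  hi = _mm_srli_epi16(_mm_add_epi16(_mm_mullo_epi16(hi, ahi), kHalf), 8);
  __m128i out = _mm_packus_epi16(lo, hi);
  out = _mm_or_si128(out, _mm_and_si128(argb, kAlphaMask)); /* restore alpha */
  _mm_storeu_si128((__m128i*)dst, out);
}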
#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u,
128u, 128u, 14u, 15u, 14u, 15u,
14u, 15u, 128u, 128u};
// Attenuate 8 pixels at a time.
void ARGBAttenuateRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_argb,
int width) {
asm volatile(
"vbroadcastf128 %3,%%ymm4 \n"
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpslld $0x18,%%ymm5,%%ymm5 \n"
"vbroadcastf128 %3,%%ymm6 \n" // alpha shuffler
"vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" // 0xff000000
"vpsllw $0xf,%%ymm0,%%ymm7 \n" // 0x0080
"vpslld $0x18,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm7,%%ymm7 \n"
"sub %0,%1 \n"
// 8 pixel loop.
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm6 \n"
"vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n"
"vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm2 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm3 \n"
"vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n"
"vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n"
"vpand %%ymm5,%%ymm6,%%ymm6 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vpor %%ymm6,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,0x00(%0,%1,1) \n"
"lea 0x20(%0),%0 \n"
"vmovdqu (%0),%%ymm1 \n"
"vpxor %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm3,%%ymm1,%%ymm2 \n"
"vpunpckhbw %%ymm3,%%ymm1,%%ymm3 \n"
"vpshufb %%ymm6,%%ymm2,%%ymm4 \n"
"vpshufb %%ymm6,%%ymm3,%%ymm5 \n"
"vpmullw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmullw %%ymm5,%%ymm3,%%ymm3 \n"
"vpaddw %%ymm7,%%ymm2,%%ymm2 \n"
"vpaddw %%ymm7,%%ymm3,%%ymm3 \n"
"vpsrlw $0x8,%%ymm2,%%ymm2 \n"
"vpsrlw $0x8,%%ymm3,%%ymm3 \n"
"vpackuswb %%ymm3,%%ymm2,%%ymm2 \n"
"vpblendvb %%ymm0,%%ymm1,%%ymm2,%%ymm2 \n"
"vmovdqu %%ymm2,(%0,%1,1) \n"
"lea 0x20(%0),%0 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kShuffleAlpha_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "m"(kAttenShuffle) // %3
: "memory", "cc",
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
}
#endif // HAS_ARGBATTENUATEROW_AVX2
@ -7068,7 +7073,6 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
"pxor %%xmm5,%%xmm5 \n"
"1: \n"
LABELALIGN
"1: \n"
@ -7111,11 +7115,10 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
uint8_t* dst_uv,
int width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x0101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"1: \n"
LABELALIGN
"1: \n"

View File

@ -123,8 +123,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
"movi v23.8b, #255 \n" /* A */
"1: \n"
READYUV444
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"subs %w4, %w4, #8 \n"
@ -188,11 +188,11 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
READYUV422
"prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
"ld1 {v23.8b}, [%3], #8 \n"
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"prfm pldl1keep, [%3, 448] \n"
"subs %w5, %w5, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
@ -223,10 +223,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
"movi v20.8b, #255 \n" /* A */
"1: \n"
READYUV422
YUVTORGB(v23, v22, v21)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
YUVTORGB(v23, v22, v21)
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"subs %w4, %w4, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@ -254,10 +254,10 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"subs %w4, %w4, #8 \n"
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
"b.gt 1b \n"
@ -295,13 +295,12 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
"1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #8 \n"
ARGBTORGB565
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
"prfm pldl1keep, [%0, 448] \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@ -337,11 +336,11 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
"1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #8 \n"
ARGBTOARGB1555
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -380,12 +379,12 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
"1: \n"
READYUV422
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB4444
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%1, 128] \n"
"prfm pldl1keep, [%2, 128] \n"
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444.
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -453,9 +452,9 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
"movi v23.8b, #255 \n"
"1: \n"
READNV12
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 256] \n"
"subs %w3, %w3, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
@ -482,9 +481,9 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
"movi v23.8b, #255 \n"
"1: \n"
READNV21
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 256] \n"
"subs %w3, %w3, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
@ -510,9 +509,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
READNV12
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 256] \n"
"subs %w3, %w3, #8 \n"
"st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
"b.gt 1b \n"
@ -538,9 +537,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
YUVTORGB_SETUP
"1: \n"
READNV21
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%1, 256] \n"
"subs %w3, %w3, #8 \n"
"st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n"
"b.gt 1b \n"
@ -562,25 +561,24 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
uint8_t* dst_rgb565,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile(YUVTORGB_SETUP
"1: \n" READNV12 YUVTORGB(
v22, v21, v20) ARGBTORGB565
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"subs %w3, %w3, #8 \n"
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28",
"v29", "v30");
asm volatile(
YUVTORGB_SETUP "1: \n" READNV12
"prfm pldl1keep, [%0, 448] \n" YUVTORGB(
v22, v21, v20) ARGBTORGB565
"prfm pldl1keep, [%1, 256] \n"
"subs %w3, %w3, #8 \n"
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
: [kUVToRB] "r"(&yuvconstants->kUVToRB),
[kUVToG] "r"(&yuvconstants->kUVToG),
[kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR),
[kYToRgb] "r"(&yuvconstants->kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30");
}
void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
@ -592,8 +590,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
"movi v23.8b, #255 \n"
"1: \n"
READYUY2
YUVTORGB(v22, v21, v20)
"prfm pldl1keep, [%0, 448] \n"
YUVTORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
@ -757,7 +755,6 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n"
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -771,7 +768,6 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n"
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -1161,9 +1157,9 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"prfm pldl1keep, [%0, 448] \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"prfm pldl1keep, [%0, 448] \n"
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@ -1861,10 +1857,10 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
// 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16.
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
int src_stride_rgb565,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile(
RGBTOUV_SETUP_REG
@ -2456,8 +2452,8 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB.
"prfm pldl1keep, [%0, 448] \n"
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"uxtl v0.8h, v0.8b \n" // b (0 .. 255)
"uxtl v1.8h, v1.8b \n"
"uxtl v2.8h, v2.8b \n"
"sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale
@ -2566,19 +2562,19 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels.
"prfm pldl1keep, [%0, 448] \n"
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
"umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
"umlal v5.8h, v1.8b, v25.8b \n" // G
"umlal v5.8h, v2.8b, v26.8b \n" // R
"umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
"umlal v6.8h, v1.8b, v29.8b \n" // G
"umlal v6.8h, v2.8b, v30.8b \n" // R
"uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"subs %w1, %w1, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B
"umlal v4.8h, v1.8b, v21.8b \n" // G
"umlal v4.8h, v2.8b, v22.8b \n" // R
"umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G
"umlal v5.8h, v1.8b, v25.8b \n" // G
"umlal v5.8h, v2.8b, v26.8b \n" // R
"umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R
"umlal v6.8h, v1.8b, v29.8b \n" // G
"umlal v6.8h, v2.8b, v30.8b \n" // R
"uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
"b.gt 1b \n"
: "+r"(dst_argb), // %0
@ -3225,8 +3221,8 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
int width) {
asm volatile(
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
"prfm pldl1keep, [%0, 448] \n"
"prfm pldl1keep, [%1, 448] \n"
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values

View File

@ -4254,6 +4254,8 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
}
#endif // HAS_ARGBBLENDROW_SSSE3
// ARGBAttenuateRow disabled on win32 due to differences (off by 1) compared
// to C and Neon. Use row_gcc.cc with clangcl.
#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha.
static const uvec8 kShuffleAlpha0 = {

View File

@ -38,6 +38,26 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
align_buffer_page_end(unatten_pixels, kSize);
align_buffer_page_end(atten2_pixels, kSize);
// Test attenuation with maximum alpha
orig_pixels[0 * 4 + 0] = 10u;
orig_pixels[0 * 4 + 1] = 20u;
orig_pixels[0 * 4 + 2] = 30u;
orig_pixels[0 * 4 + 3] = 255u;
orig_pixels[1 * 4 + 0] = 255u;
orig_pixels[1 * 4 + 1] = 128u;
orig_pixels[1 * 4 + 2] = 99u;
orig_pixels[1 * 4 + 3] = 255u;
ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 2, 1);
EXPECT_EQ(10u, atten_pixels[0 * 4 + 0]);
EXPECT_EQ(20u, atten_pixels[0 * 4 + 1]);
EXPECT_EQ(30u, atten_pixels[0 * 4 + 2]);
EXPECT_EQ(255u, atten_pixels[0 * 4 + 3]);
EXPECT_EQ(254u, atten_pixels[1 * 4 + 0]);
EXPECT_EQ(128u, atten_pixels[1 * 4 + 1]);
EXPECT_EQ(99u, atten_pixels[1 * 4 + 2]);
EXPECT_EQ(255u, atten_pixels[1 * 4 + 3]);
// Test unattenuation clamps
orig_pixels[0 * 4 + 0] = 200u;
orig_pixels[0 * 4 + 1] = 129u;
@ -100,9 +120,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) {
EXPECT_EQ(32, atten_pixels[128 * 4 + 1]);
EXPECT_EQ(21, atten_pixels[128 * 4 + 2]);
EXPECT_EQ(128, atten_pixels[128 * 4 + 3]);
EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1);
EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1);
EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1);
EXPECT_EQ(254, atten_pixels[255 * 4 + 0]);
EXPECT_EQ(127, atten_pixels[255 * 4 + 1]);
EXPECT_EQ(85, atten_pixels[255 * 4 + 2]);
EXPECT_EQ(255, atten_pixels[255 * 4 + 3]);
free_aligned_buffer_page_end(atten2_pixels);
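
For reference, the exact expectations above follow directly from the (f * a + 128) >> 8 rounding; a quick self-check (hypothetical helper, not part of the test):

#include <assert.h>

static int atten(int f, int a) { return (f * a + 128) >> 8; }

int main(void) {
  assert(atten(255, 255) == 254); /* a full-scale channel loses 1 even at alpha 255 */
  assert(atten(128, 255) == 128);
  assert(atten(99, 255) == 99);
  assert(atten(10, 255) == 10);   /* the first test pixel is unchanged */
  return 0;
}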
@ -1125,7 +1145,8 @@ static int TestBlend(int width,
int disable_cpu_flags,
int benchmark_cpu_info,
int invert,
int off) {
int off,
int attenuate) {
if (width < 1) {
width = 1;
}
@ -1139,10 +1160,12 @@ static int TestBlend(int width,
src_argb_a[i + off] = (fastrand() & 0xff);
src_argb_b[i + off] = (fastrand() & 0xff);
}
ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
height);
ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
height);
MemRandomize(src_argb_a, kStride * height + off);
MemRandomize(src_argb_b, kStride * height + off);
if (attenuate) {
ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
height);
}
memset(dst_argb_c, 255, kStride * height);
memset(dst_argb_opt, 255, kStride * height);
@ -1172,28 +1195,35 @@ static int TestBlend(int width,
TEST_F(LibYUVPlanarTest, ARGBBlend_Any) {
int max_diff =
TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 1);
disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, -1, 0);
disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0);
EXPECT_LE(max_diff, 1);
}
TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) {
int max_diff =
TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_,
disable_cpu_flags_, benchmark_cpu_info_, +1, 0);
disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1);
EXPECT_LE(max_diff, 1);
}