From 84da59c1689d62d199c6586480e459e51315e14c Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 12 May 2020 20:37:43 -0700 Subject: [PATCH] ARGBAttenuate AVX2 rewritten to match NEON/C code Bug: 665 Change-Id: If26fb389dabbca870a0e720f5258d6c9b2cde156 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2196904 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/row.h | 5 +- include/libyuv/version.h | 2 +- source/row_common.cc | 61 ++++++++--------- source/row_gcc.cc | 137 +++++++++++++++++++------------------- source/row_neon64.cc | 140 +++++++++++++++++++-------------------- source/row_win.cc | 2 + unit_test/planar_test.cc | 54 +++++++++++---- 8 files changed, 214 insertions(+), 189 deletions(-) diff --git a/README.chromium b/README.chromium index a585593f2..1d963266e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1753 +Version: 1754 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index c4c3dd442..e0a65c696 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -143,7 +143,6 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 -#define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -231,7 +230,6 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_AVX2 -#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 @@ -270,6 +268,7 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #define HAS_ABGRTOAR30ROW_SSSE3 +#define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 @@ -285,7 +284,6 @@ extern "C" { #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 - #endif // The following are available for AVX2 gcc/clang x86 platforms: @@ -296,6 +294,7 @@ extern "C" { #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 #define HAS_ABGRTOYROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 45003409c..321b466c6 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1753 +#define LIBYUV_VERSION 1754 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc b/source/row_common.cc index 941e0b9f6..6b0194170 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -763,11 +763,11 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, r3 = (r3 << 3) | (r3 >> 2); #if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; @@ -776,10 +776,10 @@ void RGB565ToUVRow_C(const uint8_t* src_rgb565, dst_v[0] = RGB2xToV(r, g, b); #endif - src_rgb565 += 4; - next_rgb565 += 4; - dst_u += 1; - dst_v += 1; + src_rgb565 += 4; + next_rgb565 += 4; + dst_u += 1; + dst_v += 1; } if (width & 1) { uint8_t b0 = src_rgb565[0] & 0x1f; @@ -847,11 +847,11 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, r3 = (r3 << 3) | (r3 >> 2); #if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; @@ -860,10 +860,10 @@ void ARGB1555ToUVRow_C(const uint8_t* src_argb1555, dst_v[0] = RGB2xToV(r, g, b); #endif - src_argb1555 += 4; - next_argb1555 += 4; - dst_u += 1; - dst_v += 1; + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; } if (width & 1) { uint8_t b0 = src_argb1555[0] & 0x1f; @@ -931,11 +931,11 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, r3 = (r3 << 4) | r3; #if LIBYUV_ARGBTOUV_PAVGB - uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); - uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); - uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); + uint8_t ab = AVGB(AVGB(b0, b2), AVGB(b1, b3)); + uint8_t ag = AVGB(AVGB(g0, g2), AVGB(g1, g3)); + uint8_t ar = AVGB(AVGB(r0, r2), AVGB(r1, r3)); + dst_u[0] = RGBToU(ar, ag, ab); + dst_v[0] = RGBToV(ar, ag, ab); #else uint16_t b = (b0 + b1 + b2 + b3 + 1) >> 1; uint16_t g = (g0 + g1 + g2 + g3 + 1) >> 1; @@ -944,10 +944,10 @@ void ARGB4444ToUVRow_C(const uint8_t* src_argb4444, dst_v[0] = RGB2xToV(r, g, b); #endif - src_argb4444 += 4; - next_argb4444 += 4; - dst_u += 1; - dst_v += 1; + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; } if (width & 1) { uint8_t b0 = src_argb4444[0] & 0x0f; @@ -2681,7 +2681,7 @@ void UYVYToYRow_C(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { } } -#define BLEND(f, b, a) (((256 - a) * b) >> 8) + f +#define BLEND(f, b, a) clamp255((((256 - a) * b) >> 8) + f) // Blend src_argb0 over src_argb1 and store to dst_argb. // dst_argb may be src_argb0 or src_argb1. @@ -2757,12 +2757,7 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND -#if defined(__aarch64__) || defined(__arm__) #define ATTENUATE(f, a) (f * a + 128) >> 8 -#else -// This code mimics the SSSE3 version for better testability. -#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 -#endif // Multiply source RGB by alpha and store to destination. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index c4a9579d0..181d7b373 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4892,94 +4892,99 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #endif // HAS_BLENDPLANEROW_AVX2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 -// Shuffle table duplicating alpha -static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, - 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; -static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, - 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; +// Shuffle table duplicating alpha. +static const uvec8 kAttenShuffle = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, + 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u}; + // Attenuate 4 pixels at a time. void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "pcmpeqb %%xmm3,%%xmm3 \n" - "pslld $0x18,%%xmm3 \n" - "movdqa %3,%%xmm4 \n" - "movdqa %4,%%xmm5 \n" + "movdqu %3,%%xmm6 \n" // alpha shuffler + "pcmpeqb %%xmm7,%%xmm7 \n" // 0x0080 + "psllw $0xf,%%xmm7 \n" + "psrlw $0x8,%%xmm7 \n" + "pcmpeqb %%xmm0,%%xmm0 \n" // 0xff000000 + "pslld $0x18,%%xmm0 \n" + "sub %0,%1 \n" // 4 pixel loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm0 \n" - "pshufb %%xmm4,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "punpcklbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm1,%%xmm0 \n" - "movdqu (%0),%%xmm1 \n" - "pshufb %%xmm5,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "punpckhbw %%xmm2,%%xmm2 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "movdqu (%0),%%xmm2 \n" - "lea 0x10(%0),%0 \n" - "pand %%xmm3,%%xmm2 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "por %%xmm2,%%xmm0 \n" - "movdqu %%xmm0,(%1) \n" - "lea 0x10(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha0), // %3 - "m"(kShuffleAlpha1) // %4 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); + "1: \n" + "movdqu (%0),%%xmm1 \n" + "pxor %%xmm4,%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpcklbw %%xmm4,%%xmm2 \n" + "punpckhbw %%xmm4,%%xmm3 \n" + "movdqa %%xmm2,%%xmm4 \n" + "movdqa %%xmm3,%%xmm5 \n" + "pshufb %%xmm6,%%xmm4 \n" + "pshufb %%xmm6,%%xmm5 \n" + "pmullw %%xmm4,%%xmm2 \n" + "pmullw %%xmm5,%%xmm3 \n" + "pand %%xmm0,%%xmm1 \n" + "paddw %%xmm7,%%xmm2 \n" + "paddw %%xmm7,%%xmm3 \n" + "psrlw $0x8,%%xmm2 \n" + "psrlw $0x8,%%xmm3 \n" + "packuswb %%xmm3,%%xmm2 \n" + "por %%xmm1,%%xmm2 \n" + "movdqu %%xmm2,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenShuffle) // %3 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 -// Shuffle table duplicating alpha. -static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, - 128u, 128u, 14u, 15u, 14u, 15u, - 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm4 \n" - "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" - "vpslld $0x18,%%ymm5,%%ymm5 \n" + "vbroadcastf128 %3,%%ymm6 \n" // alpha shuffler + "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" // 0xff000000 + "vpsllw $0xf,%%ymm0,%%ymm7 \n" // 0x0080 + "vpslld $0x18,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm7,%%ymm7 \n" "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm6 \n" - "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" - "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" - "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" - "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" - "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" - "vpand %%ymm5,%%ymm6,%%ymm6 \n" - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm1,%%ymm1 \n" - "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" - "vpor %%ymm6,%%ymm0,%%ymm0 \n" - "vmovdqu %%ymm0,0x00(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" + "vmovdqu (%0),%%ymm1 \n" + "vpxor %%ymm3,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm1,%%ymm2 \n" + "vpunpckhbw %%ymm3,%%ymm1,%%ymm3 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm4 \n" + "vpshufb %%ymm6,%%ymm3,%%ymm5 \n" + "vpmullw %%ymm4,%%ymm2,%%ymm2 \n" + "vpmullw %%ymm5,%%ymm3,%%ymm3 \n" + "vpaddw %%ymm7,%%ymm2,%%ymm2 \n" + "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" + "vpsrlw $0x8,%%ymm3,%%ymm3 \n" + "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" + "vpblendvb %%ymm0,%%ymm1,%%ymm2,%%ymm2 \n" + "vmovdqu %%ymm2,(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" - "jg 1b \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAlpha_AVX2) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kAttenShuffle) // %3 + : "memory", "cc", + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBATTENUATEROW_AVX2 @@ -7068,7 +7073,6 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" - "1: \n" LABELALIGN "1: \n" @@ -7111,11 +7115,10 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x0101 "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" - "1: \n" LABELALIGN "1: \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 039a2e277..2961bd3df 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -123,8 +123,8 @@ void I444ToARGBRow_NEON(const uint8_t* src_y, "movi v23.8b, #255 \n" /* A */ "1: \n" READYUV444 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" + YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" "subs %w4, %w4, #8 \n" @@ -188,11 +188,11 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" READYUV422 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB(v22, v21, v20) "ld1 {v23.8b}, [%3], #8 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "prfm pldl1keep, [%3, 448] \n" "subs %w5, %w5, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" @@ -223,10 +223,10 @@ void I422ToRGBARow_NEON(const uint8_t* src_y, "movi v20.8b, #255 \n" /* A */ "1: \n" READYUV422 - YUVTORGB(v23, v22, v21) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + YUVTORGB(v23, v22, v21) + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "subs %w4, %w4, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -254,10 +254,10 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "subs %w4, %w4, #8 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" "b.gt 1b \n" @@ -295,13 +295,12 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #8 \n" ARGBTORGB565 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -337,11 +336,11 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #8 \n" ARGBTOARGB1555 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -380,12 +379,12 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, "1: \n" READYUV422 YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" + "prfm pldl1keep, [%1, 128] \n" + "prfm pldl1keep, [%2, 128] \n" "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels ARGB4444. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -453,9 +452,9 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y, "movi v23.8b, #255 \n" "1: \n" READNV12 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%1, 256] \n" "subs %w3, %w3, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -482,9 +481,9 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y, "movi v23.8b, #255 \n" "1: \n" READNV21 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%1, 256] \n" "subs %w3, %w3, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -510,9 +509,9 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" READNV12 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%1, 256] \n" "subs %w3, %w3, #8 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" @@ -538,9 +537,9 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y, YUVTORGB_SETUP "1: \n" READNV21 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" + YUVTORGB(v22, v21, v20) + "prfm pldl1keep, [%1, 256] \n" "subs %w3, %w3, #8 \n" "st3 {v20.8b,v21.8b,v22.8b}, [%2], #24 \n" "b.gt 1b \n" @@ -562,25 +561,24 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, uint8_t* dst_rgb565, const struct YuvConstants* yuvconstants, int width) { - asm volatile(YUVTORGB_SETUP - "1: \n" READNV12 YUVTORGB( - v22, v21, v20) ARGBTORGB565 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #8 \n" - "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels - "b.gt 1b \n" - : "+r"(src_y), // %0 - "+r"(src_uv), // %1 - "+r"(dst_rgb565), // %2 - "+r"(width) // %3 - : [kUVToRB] "r"(&yuvconstants->kUVToRB), - [kUVToG] "r"(&yuvconstants->kUVToG), - [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), - [kYToRgb] "r"(&yuvconstants->kYToRgb) - : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", - "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", - "v29", "v30"); + asm volatile( + YUVTORGB_SETUP "1: \n" READNV12 + "prfm pldl1keep, [%0, 448] \n" YUVTORGB( + v22, v21, v20) ARGBTORGB565 + "prfm pldl1keep, [%1, 256] \n" + "subs %w3, %w3, #8 \n" + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, @@ -592,8 +590,8 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2, "movi v23.8b, #255 \n" "1: \n" READYUY2 - YUVTORGB(v22, v21, v20) "prfm pldl1keep, [%0, 448] \n" + YUVTORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" @@ -757,7 +755,6 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) { "1: \n" "subs %w1, %w1, #16 \n" // 16 bytes per loop "st1 {v0.16b}, [%0], #16 \n" // store - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 @@ -771,7 +768,6 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { "1: \n" "subs %w1, %w1, #4 \n" // 4 ints per loop "st1 {v0.16b}, [%0], #16 \n" // store - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(dst), // %0 "+r"(width) // %1 @@ -1161,9 +1157,9 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. + "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -1861,10 +1857,10 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, // 16x2 pixels -> 8x1. width is number of rgb pixels. e.g. 16. void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, - int src_stride_rgb565, - uint8_t* dst_u, - uint8_t* dst_v, - int width) { + int src_stride_rgb565, + uint8_t* dst_u, + uint8_t* dst_v, + int width) { const uint8_t* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile( RGBTOUV_SETUP_REG @@ -2456,8 +2452,8 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "uxtl v0.8h, v0.8b \n" // b (0 .. 255) "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale @@ -2566,19 +2562,19 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. "prfm pldl1keep, [%0, 448] \n" - "subs %w1, %w1, #8 \n" // 8 processed per loop. - "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B - "umlal v4.8h, v1.8b, v21.8b \n" // G - "umlal v4.8h, v2.8b, v22.8b \n" // R - "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G - "umlal v5.8h, v1.8b, v25.8b \n" // G - "umlal v5.8h, v2.8b, v26.8b \n" // R - "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R - "umlal v6.8h, v1.8b, v29.8b \n" // G - "umlal v6.8h, v2.8b, v30.8b \n" // R - "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B - "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G - "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R + "subs %w1, %w1, #8 \n" // 8 processed per loop. + "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "umlal v4.8h, v1.8b, v21.8b \n" // G + "umlal v4.8h, v2.8b, v22.8b \n" // R + "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G + "umlal v5.8h, v1.8b, v25.8b \n" // G + "umlal v5.8h, v2.8b, v26.8b \n" // R + "umull v6.8h, v0.8b, v28.8b \n" // B to Sepia R + "umlal v6.8h, v1.8b, v29.8b \n" // G + "umlal v6.8h, v2.8b, v30.8b \n" // R + "uqshrn v0.8b, v4.8h, #7 \n" // 16 bit to 8 bit B + "uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G + "uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels. "b.gt 1b \n" : "+r"(dst_argb), // %0 @@ -3225,8 +3221,8 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, int width) { asm volatile( "1: \n" - "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values - "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values diff --git a/source/row_win.cc b/source/row_win.cc index fdb111e8b..08b80fcc8 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4254,6 +4254,8 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, } #endif // HAS_ARGBBLENDROW_SSSE3 +// ARGBAttenuateRow disabled on win32 due to differences (off by 1) compared +// to C and Neon. Use row_gcc.cc with clangcl. #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. static const uvec8 kShuffleAlpha0 = { diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 736e478a7..6949db258 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -38,6 +38,26 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { align_buffer_page_end(unatten_pixels, kSize); align_buffer_page_end(atten2_pixels, kSize); + // Test unattenuation clamps + orig_pixels[0 * 4 + 0] = 10u; + orig_pixels[0 * 4 + 1] = 20u; + orig_pixels[0 * 4 + 2] = 30u; + orig_pixels[0 * 4 + 3] = 255u; + orig_pixels[1 * 4 + 0] = 255u; + orig_pixels[1 * 4 + 1] = 128u; + orig_pixels[1 * 4 + 2] = 99u; + orig_pixels[1 * 4 + 3] = 255u; + + ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 2, 1); + EXPECT_EQ(10u, atten_pixels[0 * 4 + 0]); + EXPECT_EQ(20u, atten_pixels[0 * 4 + 1]); + EXPECT_EQ(30u, atten_pixels[0 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[0 * 4 + 3]); + EXPECT_EQ(254u, atten_pixels[1 * 4 + 0]); + EXPECT_EQ(128u, atten_pixels[1 * 4 + 1]); + EXPECT_EQ(99u, atten_pixels[1 * 4 + 2]); + EXPECT_EQ(255u, atten_pixels[1 * 4 + 3]); + // Test unattenuation clamps orig_pixels[0 * 4 + 0] = 200u; orig_pixels[0 * 4 + 1] = 129u; @@ -100,9 +120,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); - EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); - EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); - EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); + EXPECT_EQ(254, atten_pixels[255 * 4 + 0]); + EXPECT_EQ(127, atten_pixels[255 * 4 + 1]); + EXPECT_EQ(85, atten_pixels[255 * 4 + 2]); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels); @@ -1125,7 +1145,8 @@ static int TestBlend(int width, int disable_cpu_flags, int benchmark_cpu_info, int invert, - int off) { + int off, + int attenuate) { if (width < 1) { width = 1; } @@ -1139,10 +1160,12 @@ static int TestBlend(int width, src_argb_a[i + off] = (fastrand() & 0xff); src_argb_b[i + off] = (fastrand() & 0xff); } - ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width, - height); - ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width, - height); + MemRandomize(src_argb_a, kStride * height + off); + MemRandomize(src_argb_b, kStride * height + off); + if (attenuate) { + ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width, + height); + } memset(dst_argb_c, 255, kStride * height); memset(dst_argb_opt, 255, kStride * height); @@ -1172,28 +1195,35 @@ static int TestBlend(int width, TEST_F(LibYUVPlanarTest, ARGBBlend_Any) { int max_diff = TestBlend(benchmark_width_ - 4, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Unaligned) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 1); + disable_cpu_flags_, benchmark_cpu_info_, +1, 1, 1); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Invert) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, -1, 0); + disable_cpu_flags_, benchmark_cpu_info_, -1, 0, 1); + EXPECT_LE(max_diff, 1); +} + +TEST_F(LibYUVPlanarTest, ARGBBlend_Unattenuated) { + int max_diff = + TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 0); EXPECT_LE(max_diff, 1); } TEST_F(LibYUVPlanarTest, ARGBBlend_Opt) { int max_diff = TestBlend(benchmark_width_, benchmark_height_, benchmark_iterations_, - disable_cpu_flags_, benchmark_cpu_info_, +1, 0); + disable_cpu_flags_, benchmark_cpu_info_, +1, 0, 1); EXPECT_LE(max_diff, 1); }