From 1837f0022eaa6fde28dc78822c3eb7c4223e66f9 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 13 Jul 2020 14:44:29 -0700 Subject: [PATCH] Rollback of ARGBAttenuate ARGBAttenuate AVX2 different than NEON/C Was C ARGBAttenuate_Opt (1151 ms) SSSE3 ARGBAttenuate_Opt (455 ms) AVX2 ARGBAttenuate_Opt (296 ms) Now C ARGBAttenuate_Opt (1765 ms) SSSE3 ARGBAttenuate_Opt (355 ms) AVX2 ARGBAttenuate_Opt (299 ms) BUG=b/153564664 Change-Id: I2f027339552e399b90cc5ffeffde4255e9ff175b Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2294488 Commit-Queue: Frank Barchard Reviewed-by: Mirko Bonadei --- README.chromium | 2 +- include/libyuv/row.h | 4 +- include/libyuv/version.h | 2 +- source/row_common.cc | 5 ++ source/row_gcc.cc | 135 +++++++++++++++++++------------------- source/row_win.cc | 2 - unit_test/convert_test.cc | 32 +++++---- unit_test/planar_test.cc | 26 +------- 8 files changed, 97 insertions(+), 111 deletions(-) diff --git a/README.chromium b/README.chromium index e2d913276..89dd5c8f0 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1760 +Version: 1762 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 18b0bfd7e..a27788c1f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -142,6 +142,7 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_SSE2 #define HAS_ARGBAFFINEROW_SSE2 +#define HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBBLENDROW_SSSE3 #define HAS_ARGBCOLORMATRIXROW_SSSE3 #define HAS_ARGBCOLORTABLEROW_X86 @@ -228,6 +229,7 @@ extern "C" { // Effects: #define HAS_ARGBADDROW_AVX2 +#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBMULTIPLYROW_AVX2 #define HAS_ARGBSUBTRACTROW_AVX2 #define HAS_ARGBUNATTENUATEROW_AVX2 @@ -266,7 +268,6 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #define HAS_ABGRTOAR30ROW_SSSE3 -#define
HAS_ARGBATTENUATEROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 @@ -293,7 +294,6 @@ extern "C" { #define HAS_ABGRTOAR30ROW_AVX2 #define HAS_ABGRTOUVROW_AVX2 #define HAS_ABGRTOYROW_AVX2 -#define HAS_ARGBATTENUATEROW_AVX2 #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 62edc7afd..e54313543 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1760 +#define LIBYUV_VERSION 1762 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_common.cc b/source/row_common.cc index c7420ed6f..588f8c42f 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2782,7 +2782,12 @@ void BlendPlaneRow_C(const uint8_t* src0, } #undef UBLEND +#if defined(__aarch64__) || defined(__arm__) #define ATTENUATE(f, a) (f * a + 128) >> 8 +#else +// This code mimics the SSSE3 version for better testability. +#define ATTENUATE(f, a) (a | (a << 8)) * (f | (f << 8)) >> 24 +#endif // Multiply source RGB by alpha and store to destination. void ARGBAttenuateRow_C(const uint8_t* src_argb, uint8_t* dst_argb, int width) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f70541884..709f0709d 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4889,98 +4889,93 @@ void BlendPlaneRow_AVX2(const uint8_t* src0, #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. -static const uvec8 kAttenShuffle = {6u, 7u, 6u, 7u, 6u, 7u, 128u, 128u, - 14u, 15u, 14u, 15u, 14u, 15u, 128u, 128u}; - +static const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, + 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u}; +static const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, + 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u}; // Attenuate 4 pixels at a time. 
void ARGBAttenuateRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "movdqu %3,%%xmm6 \n" // alpha shuffler - "pcmpeqb %%xmm7,%%xmm7 \n" // 0x0080 - "psllw $0xf,%%xmm7 \n" - "psrlw $0x8,%%xmm7 \n" - "pcmpeqb %%xmm0,%%xmm0 \n" // 0xff000000 - "pslld $0x18,%%xmm0 \n" - "sub %0,%1 \n" + "pcmpeqb %%xmm3,%%xmm3 \n" + "pslld $0x18,%%xmm3 \n" + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" // 4 pixel loop. LABELALIGN - "1: \n" - "movdqu (%0),%%xmm1 \n" - "pxor %%xmm4,%%xmm4 \n" - "movdqa %%xmm1,%%xmm2 \n" - "movdqa %%xmm1,%%xmm3 \n" - "punpcklbw %%xmm4,%%xmm2 \n" - "punpckhbw %%xmm4,%%xmm3 \n" - "movdqa %%xmm2,%%xmm4 \n" - "movdqa %%xmm3,%%xmm5 \n" - "pshufb %%xmm6,%%xmm4 \n" - "pshufb %%xmm6,%%xmm5 \n" - "pmullw %%xmm4,%%xmm2 \n" - "pmullw %%xmm5,%%xmm3 \n" - "pand %%xmm0,%%xmm1 \n" - "paddw %%xmm7,%%xmm2 \n" - "paddw %%xmm7,%%xmm3 \n" - "psrlw $0x8,%%xmm2 \n" - "psrlw $0x8,%%xmm3 \n" - "packuswb %%xmm3,%%xmm2 \n" - "por %%xmm1,%%xmm2 \n" - "movdqu %%xmm2,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kAttenShuffle) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + "1: \n" + "movdqu (%0),%%xmm0 \n" + "pshufb %%xmm4,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "punpcklbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm1,%%xmm0 \n" + "movdqu (%0),%%xmm1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "punpckhbw %%xmm2,%%xmm2 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "movdqu (%0),%%xmm2 \n" + "lea 0x10(%0),%0 \n" + "pand %%xmm3,%%xmm2 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "por %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha0), // %3 + "m"(kShuffleAlpha1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5"); } #endif // HAS_ARGBATTENUATEROW_SSSE3 #ifdef HAS_ARGBATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const uvec8 kShuffleAlpha_AVX2 = {6u, 7u, 6u, 7u, 6u, 7u, + 128u, 128u, 14u, 15u, 14u, 15u, + 14u, 15u, 128u, 128u}; // Attenuate 8 pixels at a time. void ARGBAttenuateRow_AVX2(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( - "vbroadcastf128 %3,%%ymm6 \n" // alpha shuffler - "vpcmpeqb %%ymm0,%%ymm0,%%ymm0 \n" // 0xff000000 - "vpsllw $0xf,%%ymm0,%%ymm7 \n" // 0x0080 - "vpslld $0x18,%%ymm0,%%ymm0 \n" - "vpsrlw $0x8,%%ymm7,%%ymm7 \n" + "vbroadcastf128 %3,%%ymm4 \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + "vpslld $0x18,%%ymm5,%%ymm5 \n" "sub %0,%1 \n" // 8 pixel loop. LABELALIGN "1: \n" - "vmovdqu (%0),%%ymm1 \n" - "vpxor %%ymm3,%%ymm3,%%ymm3 \n" - "vpunpcklbw %%ymm3,%%ymm1,%%ymm2 \n" - "vpunpckhbw %%ymm3,%%ymm1,%%ymm3 \n" - "vpshufb %%ymm6,%%ymm2,%%ymm4 \n" - "vpshufb %%ymm6,%%ymm3,%%ymm5 \n" - "vpmullw %%ymm4,%%ymm2,%%ymm2 \n" - "vpmullw %%ymm5,%%ymm3,%%ymm3 \n" - "vpaddw %%ymm7,%%ymm2,%%ymm2 \n" - "vpaddw %%ymm7,%%ymm3,%%ymm3 \n" - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" - "vpsrlw $0x8,%%ymm3,%%ymm3 \n" - "vpackuswb %%ymm3,%%ymm2,%%ymm2 \n" - "vpblendvb %%ymm0,%%ymm1,%%ymm2,%%ymm2 \n" - "vmovdqu %%ymm2,(%0,%1,1) \n" - "lea 0x20(%0),%0 \n" + "vmovdqu (%0),%%ymm6 \n" + "vpunpcklbw %%ymm6,%%ymm6,%%ymm0 \n" + "vpunpckhbw %%ymm6,%%ymm6,%%ymm1 \n" + "vpshufb %%ymm4,%%ymm0,%%ymm2 \n" + "vpshufb %%ymm4,%%ymm1,%%ymm3 \n" + "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" + "vpmulhuw %%ymm3,%%ymm1,%%ymm1 \n" + "vpand %%ymm5,%%ymm6,%%ymm6 \n" + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpor %%ymm6,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%0,%1,1) \n" + "lea 0x20(%0),%0 \n" "sub $0x8,%2 \n" - "jg 1b \n" + "jg 1b \n" "vzeroupper \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kAttenShuffle) // %3 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", 
"xmm5", "xmm6", - "xmm7"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAlpha_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } #endif // HAS_ARGBATTENUATEROW_AVX2 @@ -7069,6 +7064,7 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u, "psrlw $0xf,%%xmm4 \n" "packuswb %%xmm4,%%xmm4 \n" "pxor %%xmm5,%%xmm5 \n" + "1: \n" LABELALIGN "1: \n" @@ -7111,10 +7107,11 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u, uint8_t* dst_uv, int width) { asm volatile( - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0x0101 + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" "vpsrlw $0xf,%%ymm4,%%ymm4 \n" "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n" "vpxor %%ymm5,%%ymm5,%%ymm5 \n" + "1: \n" LABELALIGN "1: \n" diff --git a/source/row_win.cc b/source/row_win.cc index 74d87e117..9afcf060a 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4257,8 +4257,6 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, } #endif // HAS_ARGBBLENDROW_SSSE3 -// ARGBAttenuateRow disabled on win32 due to differences (off by 1) compared -// to C and Neon. Use row_gcc.cc with clangcl. #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. 
static const uvec8 kShuffleAlpha0 = { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 6e92093dc..08dc9e892 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -720,8 +720,27 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) TESTQPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, _Premult, +, 0, 1) +#define J420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, l, m) +#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, l, m) +#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, l, m) +#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, l, m) +#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, l, m) +#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, l, m) + TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \ BPP_B, W1280, N, NEG, OFF) \ @@ -2532,19 +2551,6 @@ TESTPLANARTOE(I422, 2, 1, UYVY, 2, 4, ARGB, 4) TESTQPLANARTOEI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, SUB_B, BPP_B, \ benchmark_width_, _Premult, +, 0, FMT_C, BPP_C, 1) -#define J420AlphaToARGB(a, b, c, d, e, 
f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, l, m) -#define J420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, l, m) -#define H420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, l, m) -#define H420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, l, m) -#define U420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, l, m) -#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ - I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, l, m) - TESTQPLANARTOE(I420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) TESTQPLANARTOE(J420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4) diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 6949db258..65aa46e0d 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -38,26 +38,6 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { align_buffer_page_end(unatten_pixels, kSize); align_buffer_page_end(atten2_pixels, kSize); - // Test unattenuation clamps - orig_pixels[0 * 4 + 0] = 10u; - orig_pixels[0 * 4 + 1] = 20u; - orig_pixels[0 * 4 + 2] = 30u; - orig_pixels[0 * 4 + 3] = 255u; - orig_pixels[1 * 4 + 0] = 255u; - orig_pixels[1 * 4 + 1] = 128u; - orig_pixels[1 * 4 + 2] = 99u; - orig_pixels[1 * 4 + 3] = 255u; - - ARGBAttenuate(orig_pixels, 0, atten_pixels, 0, 2, 1); - EXPECT_EQ(10u, atten_pixels[0 * 4 + 0]); - EXPECT_EQ(20u, atten_pixels[0 * 4 + 1]); - EXPECT_EQ(30u, atten_pixels[0 * 4 + 2]); - EXPECT_EQ(255u, atten_pixels[0 * 4 + 3]); - EXPECT_EQ(254u, atten_pixels[1 * 4 + 0]); - EXPECT_EQ(128u, atten_pixels[1 * 4 + 1]); - EXPECT_EQ(99u, atten_pixels[1 * 4 + 2]); - EXPECT_EQ(255u, 
atten_pixels[1 * 4 + 3]); - // Test unattenuation clamps orig_pixels[0 * 4 + 0] = 200u; orig_pixels[0 * 4 + 1] = 129u; @@ -120,9 +100,9 @@ TEST_F(LibYUVPlanarTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128 * 4 + 1]); EXPECT_EQ(21, atten_pixels[128 * 4 + 2]); EXPECT_EQ(128, atten_pixels[128 * 4 + 3]); - EXPECT_EQ(254, atten_pixels[255 * 4 + 0]); - EXPECT_EQ(127, atten_pixels[255 * 4 + 1]); - EXPECT_EQ(85, atten_pixels[255 * 4 + 2]); + EXPECT_NEAR(255, atten_pixels[255 * 4 + 0], 1); + EXPECT_NEAR(127, atten_pixels[255 * 4 + 1], 1); + EXPECT_NEAR(85, atten_pixels[255 * 4 + 2], 1); EXPECT_EQ(255, atten_pixels[255 * 4 + 3]); free_aligned_buffer_page_end(atten2_pixels);