diff --git a/README.chromium b/README.chromium
index 00b52ac7e..3365e9695 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1918
+Version: 1919
 License: BSD-3-Clause
 License File: LICENSE
 Shipped: yes
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 64cb0869c..c854adee1 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -346,11 +346,6 @@ extern "C" {
 #define HAS_SPLITXRGBROW_AVX2
 #define HAS_SWAPUVROW_AVX2
 #define HAS_YUY2TONVUVROW_AVX2
-// TODO: Port SSSE3 to AVX2
-// #define HAS_ABGRTOUVJROW_AVX2
-// #define HAS_ABGRTOUVROW_AVX2
-// #define HAS_ARGBTOUVJROW_AVX2
-#define HAS_ARGBTOUVROW_AVX2
 
 #if defined(__x86_64__) || !defined(__pic__)
 // TODO(fbarchard): fix build error on android_full_debug=1
@@ -360,6 +355,18 @@ extern "C" {
 #endif
 #endif
 
+// The following are available for AVX2 gcc/clang x64 platforms:
+// TODO(fbarchard): Port to Visual C
+// TODO(fbarchard): Port to x86 32 bit
+#if !defined(LIBYUV_DISABLE_X86) && defined(__x86_64__) && \
+    (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) &&  \
+    !defined(LIBYUV_ENABLE_ROWWIN)
+#define HAS_ABGRTOUVJROW_AVX2
+#define HAS_ABGRTOUVROW_AVX2
+#define HAS_ARGBTOUVJROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#endif
+
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 35bf7d3ec..0fc37aed8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1918
+#define LIBYUV_VERSION 1919
 
 #endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index ac90fb60c..49f9159a9 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1747,9 +1747,8 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
       "movdqa %0,%%xmm6 \n" // ARGB to U
       "movdqa %1,%%xmm7 \n" // ARGB to V
       :
-      :
-      "m"(rgbuvconstants->kRGBToU), // %0
-      "m"(rgbuvconstants->kRGBToV) // %1
+      : "m"(rgbuvconstants->kRGBToU), // %0
+        "m"(rgbuvconstants->kRGBToV) // %1
       : "memory", "cc");
 
   asm volatile(
@@ -1824,9 +1823,12 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
 
 #ifdef HAS_ARGBTOUVROW_AVX2
-// UYVY shuf 8 UV to 16 UV.
-static const vec8 kShuffleAARRGGBB = {0, 4, 1, 5, 2, 6, 3, 7,
-                                      8, 12, 9, 13, 10, 14, 11, 15};
+// ARGBARGB to AARRGGBB shuffle
+static const lvec8 kShuffleAARRGGBB = {
+    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+    0, 4, 1, 5, 2, 6, 3, 7, 8, 12, 9, 13, 10, 14, 11, 15,
+};
+
 // 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
 // ARGBToUV does rounding average of 4 ARGB pixels
@@ -1837,19 +1839,16 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
                             int width,
                             const struct RgbUVConstants* rgbuvconstants) {
   asm volatile(
-      "vbroadcastf128 %7,%%ymm15 \n" // kShuffleAARRGGBB
+      "vmovdqa %7,%%ymm15 \n" // kShuffleAARRGGBB
       "vpcmpeqb %%ymm14,%%ymm14,%%ymm14 \n" // 0x0101
       "vpabsb %%ymm14,%%ymm14 \n"
       "vpxor %%ymm13,%%ymm13,%%ymm13 \n" // 0 for vpavgw
-
       "vbroadcastf128 %5,%%ymm6 \n" // RGBToU
       "vbroadcastf128 %6,%%ymm7 \n" // RGBToV
       "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0x8000
       "vpsllw $15,%%ymm5,%%ymm5 \n"
-      "sub %1,%2 \n"
-
       LABELALIGN
       "1: \n"
       "vmovdqu (%0),%%ymm8 \n" // Read 16x2 ARGB Pixels
       "vmovdqu 0x20(%0),%%ymm9 \n"
@@ -1886,7 +1885,7 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
       "lea 0x40(%0),%0 \n"
       "lea 0x8(%1),%1 \n"
       "subl $0x10,%3 \n"
-      "jg 1b \n"
+      "jg 1b \n"
       "vzeroupper \n"
       : "+r"(src_argb), // %0
         "+r"(dst_u), // %1
@@ -1899,10 +1898,85 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7", "xmm8",
         "xmm9", "xmm10", "xmm11", "xmm13", "xmm14", "xmm15");
 }
+
+// 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
+// ARGBToUV does rounding average of 4 ARGB pixels
+// TODO: unroll to 32x2 or update caller/any to 16 pixels
+void ARGBToUVMatrixRow_AVX2_MADDWD(const uint8_t* src_argb,
+                                   int src_stride_argb,
+                                   uint8_t* dst_u,
+                                   uint8_t* dst_v,
+                                   int width,
+                                   const struct RgbUVConstants* rgbuvconstants) {
+  asm volatile(
+      "vmovdqa %7,%%ymm15 \n" // kShuffleAARRGGBB
+      "vpcmpeqb %%ymm14,%%ymm14,%%ymm14 \n" // 0x0101
+      "vpabsb %%ymm14,%%ymm14 \n"
+      "vpxor %%ymm13,%%ymm13,%%ymm13 \n" // 0 for vpavgw
+      "vpcmpeqw %%ymm12,%%ymm12,%%ymm12 \n" // -1
+      "vbroadcastf128 %5,%%ymm6 \n" // RGBToU
+      "vbroadcastf128 %6,%%ymm7 \n" // RGBToV
+      "vpcmpeqd %%ymm5,%%ymm5,%%ymm5 \n" // 0x00008000
+      "vpabsd %%ymm5,%%ymm5 \n"
+      "vpslld $15,%%ymm5,%%ymm5 \n"
+      "sub %1,%2 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm8 \n" // Read 16x2 ARGB Pixels
+      "vmovdqu 0x20(%0),%%ymm9 \n"
+      "vmovdqu 0x00(%0,%4,1),%%ymm10 \n"
+      "vmovdqu 0x20(%0,%4,1),%%ymm11 \n"
+      "vpshufb %%ymm15,%%ymm8,%%ymm8 \n" // aarrggbb
+      "vpshufb %%ymm15,%%ymm9,%%ymm9 \n"
+      "vpshufb %%ymm15,%%ymm10,%%ymm10 \n"
+      "vpshufb %%ymm15,%%ymm11,%%ymm11 \n"
+      "vpmaddubsw %%ymm14,%%ymm8,%%ymm8 \n" // 16x2 -> 8x2
+      "vpmaddubsw %%ymm14,%%ymm9,%%ymm9 \n"
+      "vpmaddubsw %%ymm14,%%ymm10,%%ymm10 \n"
+      "vpmaddubsw %%ymm14,%%ymm11,%%ymm11 \n"
+      "vpaddw %%ymm8,%%ymm10,%%ymm8 \n" // 8x2 -> 8x1
+      "vpaddw %%ymm9,%%ymm11,%%ymm9 \n"
+      "vpsrlw $1,%%ymm8,%%ymm8 \n"
+      "vpsrlw $1,%%ymm9,%%ymm9 \n"
+      "vpavgw %%ymm13,%%ymm8,%%ymm8 \n"
+      "vpavgw %%ymm13,%%ymm9,%%ymm9 \n"
+      "vpackuswb %%ymm9,%%ymm8,%%ymm0 \n" // mutates
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n" // 8 ARGB Pixels
+
+      "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" // 8 V
+      "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" // 8 U
+      "vpmaddwd %%ymm12,%%ymm1,%%ymm1 \n" // negate + hadd
+      "vpmaddwd %%ymm12,%%ymm0,%%ymm0 \n"
+      "vpaddd %%ymm1,%%ymm5,%%ymm1 \n" // +0x8000 = 0 to 0xFFFF
+      "vpaddd %%ymm0,%%ymm5,%%ymm0 \n"
+      "vpackusdw %%ymm1,%%ymm0,%%ymm0 \n" // mutates
+      "vpermq $0xd8,%%ymm0,%%ymm0 \n" // uuuuuuuu vvvvvvvv
+      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" // mutates
+      "vmovq %%xmm0,(%1) \n" // Write 8 U's
+      "vextractf128 $0x1,%%ymm0,%%xmm0 \n" // Copy V to low 8 bytes
+      "vmovq %%xmm0,0x00(%1,%2,1) \n" // Write 8 V's
+
+      "lea 0x40(%0),%0 \n"
+      "lea 0x8(%1),%1 \n"
+      "subl $0x10,%3 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_argb), // %0
+        "+r"(dst_u), // %1
+        "+r"(dst_v), // %2
+        "+r"(width) // %3
+      : "r"((intptr_t)(src_stride_argb)), // %4
+        "m"(rgbuvconstants->kRGBToU), // %5
+        "m"(rgbuvconstants->kRGBToV), // %6
+        "m"(kShuffleAARRGGBB) // %7
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7", "xmm8",
+        "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15");
+}
 #endif // HAS_ARGBTOUVROW_AVX2
 
-
-#ifdef HAS_ARGBTOUV444ROW_SSSE3
+#if defined(HAS_ARGBTOUV444ROW_SSSE3) || defined(HAS_ARGBTOUVROW_AVX2)
 // RGB to BT601 coefficients
 // UB 0.875 coefficient = 112
@@ -1916,6 +1990,20 @@ static const struct RgbUVConstants kARGBI601UVConstants = {
     {-112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0},
     {18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0}};
 
+static const struct RgbUVConstants kABGRI601UVConstants = {
+    {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
+    {-112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0}};
+
+static const struct RgbUVConstants kBGRAI601UVConstants = {
+    {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
+    {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};
+
+static const struct RgbUVConstants kRGBAI601UVConstants = {
+    {0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38},
+    {0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112}};
+#endif
+
+#ifdef HAS_ARGBTOUV444ROW_SSSE3
 void ARGBToUV444Row_SSSE3(const uint8_t* src_argb,
                           uint8_t* dst_u,
                           uint8_t* dst_v,
@@ -1935,18 +2023,6 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width,
 }
 #endif // HAS_ARGBTOUV444ROW_AVX2
 
-#ifdef HAS_ARGBTOUVROW_AVX2
-void ARGBToUVRow_AVX2(const uint8_t* src_argb,
-                      int src_stride_argb,
-                      uint8_t* dst_u,
-                      uint8_t* dst_v,
-                      int width) {
-  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
-                         &kARGBI601UVConstants);
-}
-#endif // HAS_ARGBTOUVROW_AVX2
-
-
 #ifdef HAS_ARGBTOUVROW_SSSE3
 void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                        int src_stride_argb,
                        uint8_t* dst_u,
                        uint8_t* dst_v,
@@ -1957,10 +2033,6 @@ void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
                           &kARGBI601UVConstants);
 }
 
-static const struct RgbUVConstants kABGRI601UVConstants = {
-    {38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0},
-    {-112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0}};
-
 void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                        int src_stride_abgr,
                        uint8_t* dst_u,
@@ -1970,10 +2042,6 @@ void ABGRToUVRow_SSSE3(const uint8_t* src_abgr,
                           &kABGRI601UVConstants);
 }
 
-static const struct RgbUVConstants kBGRAI601UVConstants = {
-    {0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112, 0, 38, 74, -112},
-    {0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18, 0, -112, 94, 18}};
-
 void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                        int src_stride_bgra,
                        uint8_t* dst_u,
@@ -1983,10 +2051,6 @@ void BGRAToUVRow_SSSE3(const uint8_t* src_bgra,
                           &kBGRAI601UVConstants);
 }
 
-static const struct RgbUVConstants kRGBAI601UVConstants = {
-    {0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38, 0, -112, 74, 38},
-    {0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112, 0, 18, 94, -112}};
-
 void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
                        int src_stride_rgba,
                        uint8_t* dst_u,
@@ -1997,6 +2061,26 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
 }
 #endif // HAS_ARGBTOUVROW_SSSE3
 
+#ifdef HAS_ARGBTOUVROW_AVX2
+void ARGBToUVRow_AVX2(const uint8_t* src_argb,
+                      int src_stride_argb,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         &kARGBI601UVConstants);
+}
+
+void ABGRToUVRow_AVX2(const uint8_t* src_abgr,
+                      int src_stride_abgr,
+                      uint8_t* dst_u,
+                      uint8_t* dst_v,
+                      int width) {
+  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                         &kABGRI601UVConstants);
+}
+#endif // HAS_ARGBTOUVROW_AVX2
+
 #ifdef HAS_ARGBTOUVJ444ROW_SSSE3
 // RGB to JPEG coefficients
 // UB 0.500 coefficient = 128
@@ -2043,7 +2127,9 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb,
   ARGBToUVMatrixRow_SSSE3(src_argb, src_stride_argb, dst_u, dst_v, width,
                           &kARGBJPEGUVConstants);
 }
+#endif // HAS_ARGBTOUVJROW_SSSE3
 
+#ifdef HAS_ABGRTOUVJROW_SSSE3
 void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
                         int src_stride_abgr,
                         uint8_t* dst_u,
@@ -2054,6 +2140,28 @@ void ABGRToUVJRow_SSSE3(const uint8_t* src_abgr,
 }
 #endif // HAS_ABGRTOUVJROW_SSSE3
 
+#ifdef HAS_ARGBTOUVJROW_AVX2
+void ARGBToUVJRow_AVX2(const uint8_t* src_argb,
+                       int src_stride_argb,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
+                         &kARGBJPEGUVConstants);
+}
+#endif // HAS_ARGBTOUVJROW_AVX2
+
+#ifdef HAS_ABGRTOUVJROW_AVX2
+void ABGRToUVJRow_AVX2(const uint8_t* src_abgr,
+                       int src_stride_abgr,
+                       uint8_t* dst_u,
+                       uint8_t* dst_v,
+                       int width) {
+  ARGBToUVMatrixRow_AVX2(src_abgr, src_stride_abgr, dst_u, dst_v, width,
+                         &kABGRJPEGUVConstants);
+}
+#endif // HAS_ABGRTOUVJROW_AVX2
+
 void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
   asm volatile(
       "movdqa %3,%%xmm4 \n"
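For reference, the AVX2 kernels above reduce each 2x2 block of ARGB pixels to one U and one V sample using the same chroma math as the SSSE3 path. The following is a minimal scalar sketch of that math, assuming the BT.601 limited-range coefficients encoded by kARGBI601UVConstants (112/74/38 for U; 112/94/18 for V), the 0x8080 rounding bias, and a rounding 2x2 average; the helper names are illustrative, not libyuv's own. Note the tables in the patch store the coefficients negated (for example -112 on blue in kRGBToU) because the kernels negate again, via vpmaddwd with -1 in the MADDWD variant, before adding the bias.

#include <stdint.h>

// Rounding average of two bytes, as vpavgb/vpavgw do.
static uint8_t avg_round(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + 1) >> 1);
}

// One 2x2 block of ARGB (B,G,R,A byte order in memory) from two rows,
// reduced to one U and one V sample.
static void argb_2x2_to_uv(const uint8_t* row0, const uint8_t* row1,
                           uint8_t* u, uint8_t* v) {
  // Rounding 2x2 average per channel; the SIMD rounding can differ by 1 LSB.
  uint8_t b = avg_round(avg_round(row0[0], row1[0]), avg_round(row0[4], row1[4]));
  uint8_t g = avg_round(avg_round(row0[1], row1[1]), avg_round(row0[5], row1[5]));
  uint8_t r = avg_round(avg_round(row0[2], row1[2]), avg_round(row0[6], row1[6]));
  // BT.601 limited-range chroma with the 0x8080 bias, centered on 128.
  *u = (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
  *v = (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
}

The ABGR/BGRA/RGBA constant tables are the same coefficients permuted to match each format's byte order, which is why one ARGBToUVMatrixRow kernel can serve all the wrapper functions.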
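The HAS_*TOUV*ROW_AVX2 defines enabled in row.h are consumed by libyuv's usual runtime dispatch in the convert paths. Below is a sketch of that selection pattern, assuming the existing any-width wrappers (ARGBToUVRow_Any_SSSE3 / ARGBToUVRow_Any_AVX2) and a 32-pixel multiple for the AVX2 fast path; the 32 is an assumption, and the TODO in the new MADDWD kernel suggests the caller/any width may move to 16.

#include <stdint.h>
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"

typedef void (*ARGBToUVRowFn)(const uint8_t* src_argb, int src_stride_argb,
                              uint8_t* dst_u, uint8_t* dst_v, int width);

// Hypothetical helper mirroring the row-function selection in libyuv's
// convert paths; later checks override earlier ones, so AVX2 wins when present.
static ARGBToUVRowFn ChooseARGBToUVRow(int width) {
  ARGBToUVRowFn row = ARGBToUVRow_C;  // portable fallback
#if defined(HAS_ARGBTOUVROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    row = ARGBToUVRow_Any_SSSE3;  // handles any width
    if (width % 16 == 0) {        // libyuv itself uses its IS_ALIGNED macro
      row = ARGBToUVRow_SSSE3;
    }
  }
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    row = ARGBToUVRow_Any_AVX2;
    if (width % 32 == 0) {  // assumed multiple; see the kernel TODO
      row = ARGBToUVRow_AVX2;
    }
  }
#endif
  return row;
}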