diff --git a/README.chromium b/README.chromium index c0f7290d9..a5d5e2b9e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1939 +Version: 1940 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 67e629aae..37a6d0417 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -323,6 +323,7 @@ extern "C" { #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 #define HAS_INTERPOLATEROW_16TO8_AVX2 +#define HAS_INTERPOLATEROW_16_AVX2 #define HAS_MERGEAR64ROW_AVX2 #define HAS_MERGEARGB16TO8ROW_AVX2 #define HAS_MERGEARGBROW_AVX2 @@ -6700,6 +6701,16 @@ void InterpolateRow_16_C(uint16_t* dst_ptr, ptrdiff_t src_stride, int width, int source_y_fraction); +void InterpolateRow_16_AVX2(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); +void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction); void InterpolateRow_16_NEON(uint16_t* dst_ptr, const uint16_t* src_ptr, ptrdiff_t src_stride, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 97aa52c6a..f52c5131a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1939 +#define LIBYUV_VERSION 1940 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_any.cc b/source/row_any.cc index 86991ce7d..c8b1cf94e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1858,6 +1858,15 @@ ANY11I(InterpolateRow_16_Any_NEON, 1, 7) #endif +#ifdef HAS_INTERPOLATEROW_16_AVX2 +ANY11I(InterpolateRow_16_Any_AVX2, + InterpolateRow_16_AVX2, + uint16_t, + uint16_t, + 1, + 1, + 15) // NOTE(review): 15 is presumably the width mask matching the 16-pixel step of InterpolateRow_16_AVX2; confirm against the ANY11I definition. +#endif #undef ANY11I // Any 1 to 1 interpolate with scale param diff --git a/source/row_common.cc 
b/source/row_common.cc index 67dc13019..f80745760 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -4370,26 +4370,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y, } #endif -#ifdef HAS_RGB24TOYJROW_AVX2 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -} -#endif // HAS_RGB24TOYJROW_AVX2 - -#ifdef HAS_RAWTOYJROW_AVX2 -// Convert 32 RAW pixels (128 bytes) to 32 YJ values. -} -#endif // HAS_RAWTOYJROW_AVX2 - -#ifdef HAS_RGB24TOYJROW_SSSE3 -// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. -} -#endif // HAS_RGB24TOYJROW_SSSE3 - -#ifdef HAS_RAWTOYJROW_SSSE3 -// Convert 16 RAW pixels (64 bytes) to 16 YJ values. -} -#endif // HAS_RAWTOYJROW_SSSE3 - #ifdef HAS_INTERPOLATEROW_16TO8_AVX2 void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, const uint16_t* src_ptr, @@ -4401,7 +4381,7 @@ void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr, SIMD_ALIGNED(uint16_t row[MAXTWIDTH]); while (width > 0) { int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction); + InterpolateRow_16_AVX2(row, src_ptr, src_stride, twidth, source_y_fraction); // NOTE(review): InterpolateRow_16_AVX2 steps 16 pixels per loop with no tail handling; presumably twidth is always a multiple of 16 here; confirm against callers. Convert16To8Row_AVX2(row, dst_ptr, scale, twidth); src_ptr += twidth; dst_ptr += twidth; diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 22b8d0b30..b8647cb3f 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -8911,6 +8911,85 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr, } #endif // HAS_INTERPOLATEROW_AVX2 +#ifdef HAS_INTERPOLATEROW_16_AVX2 +// Bilinear filter 16x2 -> 16x1. Blends the row at src_ptr with the row src_stride elements later using weights 256-source_y_fraction and source_y_fraction (128 added for rounding, then shifted right by 8); fractions 0 and 0x80 take the plain copy and vpavgw paths. Handles 16 pixels per loop with no tail handling. +void InterpolateRow_16_AVX2(uint16_t* dst_ptr, + const uint16_t* src_ptr, + ptrdiff_t src_stride, + int width, + int source_y_fraction) { + asm volatile( + "sub %1,%0 \n" // dst_ptr now holds the dst-minus-src byte offset, so (%1,%0,1) addresses the destination. + "cmp $0x0,%3 \n" + "je 100f \n" + "cmp $0x80,%3 \n" + "je 50f \n" + + "vmovd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x100,%3 \n" + "vmovd %3,%%xmm5 \n" + "vpunpcklwd %%xmm0,%%xmm5,%%xmm5 \n" + "vpbroadcastd %%xmm5,%%ymm5 \n" // ymm5 = repeated word pairs (256-fraction, fraction), the vpmaddwd weights. + "mov $0x80008000,%%eax \n" // 0x80008000 used to bias unsigned words to signed 
range for vpmaddwd. + "vmovd %%eax,%%xmm4 \n" + "vbroadcastss %%xmm4,%%ymm4 \n" + "mov $8388736,%%eax \n" // 32768 * 256 + 128: restores the bias removed by vpsubw (the two weights sum to 256) and adds 128 to round before the shift right by 8. + "vmovd %%eax,%%xmm3 \n" + "vbroadcastss %%xmm3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu (%1,%4,2),%%ymm1 \n" // second row, src_stride uint16_t elements after the first. + "vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" + "vpsubw %%ymm4,%%ymm2,%%ymm2 \n" + "vpsubw %%ymm4,%%ymm0,%%ymm0 \n" + "vpmaddwd %%ymm5,%%ymm2,%%ymm2 \n" + "vpmaddwd %%ymm5,%%ymm0,%%ymm0 \n" + "vpaddd %%ymm3,%%ymm2,%%ymm2 \n" + "vpaddd %%ymm3,%%ymm0,%%ymm0 \n" + "vpsrad $0x8,%%ymm2,%%ymm2 \n" + "vpsrad $0x8,%%ymm0,%%ymm0 \n" + "vpackusdw %%ymm2,%%ymm0,%%ymm0 \n" // narrow the 32-bit results back to 16-bit words. + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" // 16 pixels consumed per iteration. + "jg 1b \n" + "jmp 99f \n" + + "50: \n" // source_y_fraction == 0x80: rounded average of the two rows via vpavgw. + LABELALIGN + "2: \n" + "vmovdqu (%1),%%ymm0 \n" + "vpavgw (%1,%4,2),%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 2b \n" + "jmp 99f \n" + + "100: \n" // source_y_fraction == 0: copy the first row unchanged. + LABELALIGN + "3: \n" + "vmovdqu (%1),%%ymm0 \n" + "vmovdqu %%ymm0,0x00(%1,%0,1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 3b \n" + + "99: \n" + "vzeroupper \n" + : "+r"(dst_ptr), // %0 + "+r"(src_ptr), // %1 + "+r"(width), // %2 + "+r"(source_y_fraction) // %3 + : "r"(src_stride) // %4 + : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_INTERPOLATEROW_16_AVX2 + #ifdef HAS_ARGBSHUFFLEROW_SSSE3 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,