diff --git a/README.chromium b/README.chromium index bdd05f1f3..b96e82397 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1775 +Version: 1776 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 137b30f18..7322300da 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -289,6 +289,32 @@ int NV21ToI420(const uint8_t* src_y, int width, int height); +// Convert NV12 to NV24. +LIBYUV_API +int NV12ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + +// Convert NV16 to NV24. +LIBYUV_API +int NV16ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height); + // Convert YUY2 to I420. LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index ee77d2281..92759b2b5 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -77,12 +77,14 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && \ (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #define HAS_SCALEUVROWDOWN2BOX_SSSE3 -#define HAS_SCALECOLUP2LINEAR_SSE2 -#define HAS_SCALECOLUP2LINEAR_SSSE3 #define HAS_SCALEROWUP2LINEAR_SSE2 #define HAS_SCALEROWUP2LINEAR_SSSE3 -#define HAS_SCALECOLUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2BILINEAR_SSE2 +#define HAS_SCALEROWUP2BILINEAR_SSSE3 #define HAS_SCALEROWUP2LINEAR_16_SSE2 +#define HAS_SCALEROWUP2BILINEAR_16_SSE2 +#define HAS_SCALEUVROWUP2LINEAR_SSSE3 +#define HAS_SCALEUVROWUP2BILINEAR_SSSE3 #endif // The following are available for gcc/clang x86 platforms, but @@ -92,10 +94,12 @@ extern "C" { (defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \ (defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2)) #define HAS_SCALEUVROWDOWN2BOX_AVX2 -#define HAS_SCALECOLUP2LINEAR_AVX2 #define HAS_SCALEROWUP2LINEAR_AVX2 -#define HAS_SCALECOLUP2LINEAR_16_AVX2 +#define HAS_SCALEROWUP2BILINEAR_AVX2 #define HAS_SCALEROWUP2LINEAR_16_AVX2 +#define HAS_SCALEROWUP2BILINEAR_16_AVX2 +#define HAS_SCALEUVROWUP2LINEAR_AVX2 +#define HAS_SCALEUVROWUP2BILINEAR_AVX2 #endif // The following are available on all x86 platforms, but @@ -124,10 +128,12 @@ extern "C" { #define HAS_SCALEROWDOWN4_NEON #define HAS_SCALEUVROWDOWN2BOX_NEON #define HAS_SCALEUVROWDOWNEVEN_NEON -#define HAS_SCALECOLUP2LINEAR_NEON #define HAS_SCALEROWUP2LINEAR_NEON -#define HAS_SCALECOLUP2LINEAR_16_NEON +#define HAS_SCALEROWUP2BILINEAR_NEON #define HAS_SCALEROWUP2LINEAR_16_NEON +#define HAS_SCALEROWUP2BILINEAR_16_NEON +#define HAS_SCALEUVROWUP2LINEAR_NEON +#define HAS_SCALEUVROWUP2BILINEAR_NEON #endif #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) @@ -464,6 +470,24 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, int src_stepx, uint8_t* dst_uv, int dst_width); + +void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + 
int dst_width); + void ScaleUVCols_C(uint8_t* dst_uv, const uint8_t* src_uv, int dst_width, @@ -1163,6 +1187,55 @@ void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width); +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); +void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width); +void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width); + // ScaleRowDown2Box also used by planar functions // NEON downscalers with interpolation. diff --git a/include/libyuv/version.h b/include/libyuv/version.h index a57dfa538..6073df8f5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1775 +#define LIBYUV_VERSION 1776 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 98258b9bc..8a4fcf06e 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -16,6 +16,7 @@ #include "libyuv/rotate.h" #include "libyuv/row.h" #include "libyuv/scale.h" // For ScalePlane() +#include "libyuv/scale_uv.h" // For UVScale() #ifdef __cplusplus namespace libyuv { @@ -613,6 +614,55 @@ int NV21ToI420(const uint8_t* src_y, width, height); } +LIBYUV_API +int NV12ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), + SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width), + Abs(height), kFilterBilinear); + return 0; +} + +LIBYUV_API +int NV16ToNV24(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height) { + if (width == 0 || height == 0) { + return -1; + } + + if (dst_y) { + ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y, + Abs(width), Abs(height), kFilterBilinear); + } + UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), 
height, dst_uv, + dst_stride_uv, Abs(width), Abs(height), kFilterBilinear); + return 0; +} + // Convert YUY2 to I420. LIBYUV_API int YUY2ToI420(const uint8_t* src_yuy2, diff --git a/source/scale.cc b/source/scale.cc index 16771cd80..226024cd8 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1415,27 +1415,27 @@ void ScalePlaneUp2_Bilinear(int src_width, // This function can only scale up by 2 times. assert(src_width == ((dst_width + 1) / 2)); - assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1); + assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 if (TestCpuFlag(kCpuHasSSE2)) { Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 if (TestCpuFlag(kCpuHasSSSE3)) { Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_NEON +#ifdef HAS_SCALEROWUP2BILINEAR_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON; } @@ -1480,19 +1480,19 @@ void ScalePlaneUp2_16_Linear(int src_width, // This function can only scale up by 2 times horizontally. assert(src_width == ((dst_width + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 if (TestCpuFlag(kCpuHasSSE2)) { ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_NEON +#ifdef HAS_SCALEROWUP2LINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON; } @@ -1532,21 +1532,21 @@ void ScalePlaneUp2_16_Bilinear(int src_width, // This function can only scale up by 2 times. 
assert(src_width == ((dst_width + 1) / 2)); - assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1); + assert(src_height == ((dst_height + 1) / 2)); -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 if (TestCpuFlag(kCpuHasSSE2)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 if (TestCpuFlag(kCpuHasAVX2)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2; } #endif -#ifdef HAS_SCALEROWUP2LINEAR_NEON +#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON if (TestCpuFlag(kCpuHasNEON)) { Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON; } diff --git a/source/scale_any.cc b/source/scale_any.cc index 79394985a..4257d17b9 100644 --- a/source/scale_any.cc +++ b/source/scale_any.cc @@ -640,7 +640,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALECOLUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, ScaleRowUp2_Linear_SSE2, ScaleRowUp2_Linear_C, @@ -648,7 +648,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_SSSE3 +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, ScaleRowUp2_Linear_SSSE3, ScaleRowUp2_Linear_C, @@ -656,7 +656,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, ScaleRowUp2_Linear_16_SSE2, ScaleRowUp2_Linear_16_C, @@ -664,7 +664,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2, uint16_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, ScaleRowUp2_Linear_AVX2, ScaleRowUp2_Linear_C, @@ -672,7 +672,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, ScaleRowUp2_Linear_16_AVX2, ScaleRowUp2_Linear_16_C, @@ -680,7 +680,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2, uint16_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_NEON +#ifdef HAS_SCALEROWUP2LINEAR_NEON SUH2LANY(ScaleRowUp2_Linear_Any_NEON, ScaleRowUp2_Linear_NEON, ScaleRowUp2_Linear_C, @@ -688,7 +688,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON, uint8_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_NEON +#ifdef HAS_SCALEROWUP2LINEAR_16_NEON SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, ScaleRowUp2_Linear_16_NEON, ScaleRowUp2_Linear_16_C, @@ -699,7 +699,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON, #undef SUH2LANY // Scale up 2 times using bilinear filter. -// This function produces 2 rows at a time +// This function produces 2 rows at a time. 
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ ptrdiff_t dst_stride, int dst_width) { \ @@ -736,7 +736,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C, 0, uint16_t) -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, ScaleRowUp2_Bilinear_SSE2, ScaleRowUp2_Bilinear_C, @@ -744,7 +744,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2, uint8_t) #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, ScaleRowUp2_Bilinear_16_SSE2, ScaleRowUp2_Bilinear_16_C, @@ -752,7 +752,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2, uint16_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, ScaleRowUp2_Bilinear_SSSE3, ScaleRowUp2_Bilinear_C, @@ -760,7 +760,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, ScaleRowUp2_Bilinear_AVX2, ScaleRowUp2_Bilinear_C, @@ -768,7 +768,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, ScaleRowUp2_Bilinear_16_AVX2, ScaleRowUp2_Bilinear_16_C, @@ -776,7 +776,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2, uint16_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_NEON +#ifdef HAS_SCALEROWUP2BILINEAR_NEON SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, ScaleRowUp2_Bilinear_NEON, ScaleRowUp2_Bilinear_C, @@ -784,7 +784,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON, uint8_t) #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_NEON +#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, ScaleRowUp2_Bilinear_16_NEON, ScaleRowUp2_Bilinear_16_C, @@ -794,6 +794,120 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON, #undef SU2BLANY +// Scale bi-planar plane up horizontally 2 times using linear filter. +#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + dst_ptr[0] = src_ptr[0]; \ + dst_ptr[1] = src_ptr[1]; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(src_ptr, dst_ptr + 2, n); \ + } \ + C(src_ptr + n, dst_ptr + 2 * n + 2, r); \ + } \ + dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \ + dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \ + } + +SBUH2LANY(ScaleUVRowUp2_Linear_Any_C, + ScaleUVRowUp2_Linear_C, + ScaleUVRowUp2_Linear_C, + 0, + uint8_t) + +#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3, + ScaleUVRowUp2_Linear_SSSE3, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 +SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2, + ScaleUVRowUp2_Linear_AVX2, + ScaleUVRowUp2_Linear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_NEON +SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON, + ScaleUVRowUp2_Linear_NEON, + ScaleUVRowUp2_Linear_C, + 7, + uint8_t) +#endif + +#undef SBUH2LANY + +// Scale bi-planar plane up 2 times using bilinear filter. +// This function produces 2 rows at a time. 
+#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \ + void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \ + ptrdiff_t dst_stride, int dst_width) { \ + int work_width = (dst_width - 1) & ~1; \ + int r = work_width & MASK; \ + int n = work_width & ~MASK; \ + const PTYPE* sa = src_ptr; \ + const PTYPE* sb = src_ptr + src_stride; \ + PTYPE* da = dst_ptr; \ + PTYPE* db = dst_ptr + dst_stride; \ + da[0] = (3 * sa[0] + sb[0]) >> 2; \ + db[0] = (sa[0] + 3 * sb[0]) >> 2; \ + da[1] = (3 * sa[1] + sb[1]) >> 2; \ + db[1] = (sa[1] + 3 * sb[1]) >> 2; \ + if (work_width > 0) { \ + if (n != 0) { \ + SIMD(sa, sb - sa, da + 2, db - da, n); \ + } \ + C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \ + } \ + da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \ + sb[((dst_width + 1) & ~1) - 2]) >> 2; \ + db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \ + 3 * sb[((dst_width + 1) & ~1) - 2]) >> 2; \ + da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \ + sb[((dst_width + 1) & ~1) - 1]) >> 2; \ + db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \ + 3 * sb[((dst_width + 1) & ~1) - 1]) >> 2; \ + } + +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C, + ScaleUVRowUp2_Bilinear_C, + ScaleUVRowUp2_Bilinear_C, + 0, + uint8_t) + +#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3, + ScaleUVRowUp2_Bilinear_SSSE3, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2, + ScaleUVRowUp2_Bilinear_AVX2, + ScaleUVRowUp2_Bilinear_C, + 15, + uint8_t) +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON +SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON, + ScaleUVRowUp2_Bilinear_NEON, + ScaleUVRowUp2_Bilinear_C, + 7, + uint8_t) +#endif + +#undef SBU2BLANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/scale_common.cc b/source/scale_common.cc index 8d41c03d4..4af843216 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -1200,6 +1200,56 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv, } } +void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + dst_ptr[4 * x + 0] = + (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2; + dst_ptr[4 * x + 1] = + (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2; + dst_ptr[4 * x + 2] = + (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2; + dst_ptr[4 * x + 3] = + (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2; + } +} + +void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* s = src_ptr; + const uint8_t* t = src_ptr + src_stride; + uint8_t* d = dst_ptr; + uint8_t* e = dst_ptr + dst_stride; + int src_width = dst_width >> 1; + int x; + assert((dst_width % 2 == 0) && (dst_width >= 0)); + for (x = 0; x < src_width; ++x) { + d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 1 + 8) >> 4; + d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 1 + 8) >> 4; + d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 + + t[2 * x + 2] * 3 + 8) >> 4; + d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 + + t[2 * x + 3] * 3 + 8) >> 4; + e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 
+ + t[2 * x + 2] * 3 + 8) >> 4; + e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 + + t[2 * x + 3] * 3 + 8) >> 4; + e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 + + t[2 * x + 2] * 9 + 8) >> 4; + e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 + + t[2 * x + 3] * 9 + 8) >> 4; + } +} + // Scales a single row of pixels using point sampling. void ScaleUVCols_C(uint8_t* dst_uv, const uint8_t* src_uv, diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc index db3c9687e..226e0a956 100644 --- a/source/scale_gcc.cc +++ b/source/scale_gcc.cc @@ -779,7 +779,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr, "xmm7"); } -#ifdef HAS_SCALECOLUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2LINEAR_SSE2 void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) { @@ -833,7 +833,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_SSE2 void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -949,7 +949,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2 +#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, uint16_t* dst_ptr, int dst_width) { @@ -999,7 +999,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2 void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, @@ -1106,7 +1106,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALECOLUP2LINEAR_SSSE3 +#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; @@ -1149,7 +1149,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_SSSE3 +#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3 void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1236,7 +1236,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALECOLUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_AVX2 static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; @@ -1281,7 +1281,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_AVX2 void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, ptrdiff_t src_stride, uint8_t* dst_ptr, @@ -1364,7 +1364,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, } #endif -#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1, 3}; @@ -1450,7 +1450,7 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr, } #endif -#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2 +#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2 void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr, ptrdiff_t src_stride, uint16_t* dst_ptr, @@ -2261,6 +2261,257 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr, } #endif // HAS_SCALEUVROWDOWN2BOX_AVX2 +#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 +static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3, + 3, 1, 3, 1, 1, 3, 1, 3}; +void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + 
asm volatile( + "pcmpeqw %%xmm4,%%xmm4 \n" + "psrlw $15,%%xmm4 \n" + "psllw $1,%%xmm4 \n" // all 2 + "movdqu %3,%%xmm3 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo) + "paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo) + "paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi) + "psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo) + "psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%xmm2,%%xmm0,%%xmm0 \n" + "vmovdqu %%xmm0,(%1) \n" + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31_SSSE3) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 +void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "pcmpeqw %%xmm6,%%xmm6 \n" + "psrlw $15,%%xmm6 \n" + "psllw $3,%%xmm6 \n" // all 8 + "movdqu %5,%%xmm7 \n" + + LABELALIGN + "1: \n" + "movq (%0),%%xmm0 \n" // 00112233 (1u1v) + "movq 2(%0),%%xmm1 \n" // 11223344 (1u1v) + "punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v) + "movdqa %%xmm0,%%xmm2 \n" + "punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v) + "punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v) + "pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi) + "pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo) + + "movq (%0,%3),%%xmm1 \n" + "movq 2(%0,%3),%%xmm4 \n" + "punpcklbw %%xmm4,%%xmm1 \n" + "movdqa %%xmm1,%%xmm3 \n" + "punpckhdq %%xmm1,%%xmm3 \n" + "punpckldq %%xmm1,%%xmm1 \n" + "pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi) + "pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo) + + // xmm0 xmm2 + // xmm1 xmm3 + + "movdqa %%xmm0,%%xmm4 \n" + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo) + "paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo) + "paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo) + "paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo) + "psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo) + + "movdqa %%xmm1,%%xmm5 \n" + "paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo) + "paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo) + "paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo) + "paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo) + "psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo) + + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi) + "paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi) + "paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi) + "paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi) + "psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi) + + "movdqa %%xmm3,%%xmm1 \n" + "paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi) + "paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi) + "paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi) + "paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi) + "psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi) + + "packuswb %%xmm0,%%xmm4 \n" + "movdqu %%xmm4,(%1) \n" // store above + "packuswb %%xmm1,%%xmm5 \n" + "movdqu %%xmm5,(%1,%4) \n" // store below + + "lea 0x8(%0),%0 \n" + "lea 0x10(%1),%1 \n" // 4 uv to 8 uv + "sub 
$0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31_SSSE3) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 +static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3, + 1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3, + 1, 3, 3, 1, 3, 1, 1, 3, 1, 3}; + +void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n" + "vpsrlw $15,%%ymm4,%%ymm4 \n" + "vpsllw $1,%%ymm4,%%ymm4 \n" // all 2 + "vmovdqu %3,%%ymm3 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi) + "vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo) + "vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo) + "vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi) + "vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo) + "vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi) + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "m"(kUVLinearMadd31_AVX2) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); +} +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 +void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + asm volatile( + "vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n" + "vpsrlw $15,%%ymm6,%%ymm6 \n" + "vpsllw $3,%%ymm6,%%ymm6 \n" // all 8 + "vmovdqu %5,%%ymm7 \n" + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%xmm0 \n" + "vmovdqu 2(%0),%%xmm1 \n" + "vpermq $0b11011000,%%ymm0,%%ymm0 \n" + "vpermq $0b11011000,%%ymm1,%%ymm1 \n" + "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n" + "vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n" + "vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" + "vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi) + "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo) + + "vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF + "vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0 + "vpermq $0b11011000,%%ymm2,%%ymm2 \n" + "vpermq $0b11011000,%%ymm3,%%ymm3 \n" + "vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n" + "vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n" + "vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi) + "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo) + + // ymm0 ymm1 + // ymm2 ymm3 + + "vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo) + "vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo) + "vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo) + "vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo) + "vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo) + + "vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo) + "vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo) + "vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo) + "vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo) + "vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo) + + "vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi) + "vpaddw 
%%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi) + "vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi) + "vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi) + "vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi) + + "vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi) + "vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi) + "vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi) + "vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi) + "vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi) + + "vpackuswb %%ymm0,%%ymm4,%%ymm4 \n" + "vmovdqu %%ymm4,(%1) \n" // store above + "vpackuswb %%ymm2,%%ymm5,%%ymm5 \n" + "vmovdqu %%ymm5,(%1,%4) \n" // store below + + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" // 8 uv to 16 uv + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width) // %2 + : "r"((intptr_t)(src_stride)), // %3 + "r"((intptr_t)(dst_stride)), // %4 + "m"(kUVLinearMadd31_AVX2) // %5 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", + "xmm7"); +} +#endif + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/scale_neon.cc b/source/scale_neon.cc index e260dc955..fea3e64e1 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -509,20 +509,19 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr, int dst_width) { const uint8_t* src_temp = src_ptr + 1; asm volatile( - "vmov.u16 q15, #3 \n" + "vmov.u8 d30, #3 \n" "1: \n" - "vld1.8 {d0}, [%0]! \n" // 01234567 - "vld1.8 {d2}, [%3]! \n" // 12345678 + "vld1.8 {d4}, [%0]! \n" // 01234567 + "vld1.8 {d5}, [%3]! \n" // 12345678 - "vmovl.u8 q0, d0 \n" // 01234567 (16b) - "vmovl.u8 q1, d2 \n" // 12345678 (16b) - "vmovq q2, q0 \n" - "vmla.u16 q2, q1, q15 \n" // 3*near+far (odd) - "vmla.u16 q1, q0, q15 \n" // 3*near+far (even) + "vmovl.u8 q0, d4 \n" // 01234567 (16b) + "vmovl.u8 q1, d5 \n" // 12345678 (16b) + "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) + "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) - "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd) - "vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even) + "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) "vst2.8 {d0, d1}, [%1]! \n" // store "subs %2, %2, #16 \n" // 8 sample -> 16 sample @@ -548,25 +547,24 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, asm volatile( "vmov.u16 q15, #3 \n" + "vmov.u8 d28, #3 \n" "1: \n" - "vld1.8 {d0}, [%0]! \n" // 01234567 - "vld1.8 {d2}, [%5]! \n" // 12345678 + "vld1.8 {d4}, [%0]! \n" // 01234567 + "vld1.8 {d5}, [%5]! \n" // 12345678 - "vmovl.u8 q0, d0 \n" // 01234567 (16b) - "vmovl.u8 q1, d2 \n" // 12345678 (16b) - "vmovq q2, q0 \n" - "vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd) - "vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even) + "vmovl.u8 q0, d4 \n" // 01234567 (16b) + "vmovl.u8 q1, d5 \n" // 12345678 (16b) + "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) + "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) - "vld1.8 {d4}, [%1]! \n" // 01234567 - "vld1.8 {d6}, [%6]! \n" // 12345678 + "vld1.8 {d8}, [%1]! \n" + "vld1.8 {d9}, [%6]! 
\n" - "vmovl.u8 q2, d4 \n" // 01234567 (16b) - "vmovl.u8 q3, d6 \n" // 12345678 (16b) - "vmovq q4, q2 \n" - "vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd) - "vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even) + "vmovl.u8 q2, d8 \n" + "vmovl.u8 q3, d9 \n" + "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) + "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) // e o // q1 q0 @@ -600,7 +598,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr, "+r"(src_temp), // %5 "+r"(src_temp1) // %6 : - : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", "q15" // Clobber List ); } @@ -694,6 +692,105 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "vmov.u8 d30, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd) + "vmlal.u8 q1, d4, d30 \n" // 3*near+far (even) + + "vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd) + "vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even) + + "vst2.16 {d0, d1}, [%1]! \n" // store + "subs %2, %2, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(dst_ptr), // %1 + "+r"(dst_width), // %2 + "+r"(src_temp) // %3 + : + : "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "vmov.u16 q15, #3 \n" + "vmov.u8 d28, #3 \n" + + "1: \n" + "vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v) + "vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v) + + "vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd) + "vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even) + + "vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v) + "vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v) + + "vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b) + "vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b) + "vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd) + "vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even) + + // e o + // q1 q0 + // q3 q2 + + "vmovq q4, q2 \n" + "vmovq q5, q3 \n" + "vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd) + "vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even) + "vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd) + "vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even) + + // e o + // q5 q4 + // q1 q0 + + "vrshrn.u16 d2, q1, #4 \n" // 2, even + "vrshrn.u16 d3, q0, #4 \n" // 2, odd + "vrshrn.u16 d0, q5, #4 \n" // 1, even + "vrshrn.u16 d1, q4, #4 \n" // 1, odd + + "vst2.16 {d0, d1}, [%2]! \n" // store + "vst2.16 {d2, d3}, [%3]! \n" // store + "subs %4, %4, #8 \n" // 4 uv -> 8 uv + "bgt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_ptr1), // %3 + "+r"(dst_width), // %4 + "+r"(src_temp), // %5 + "+r"(src_temp1) // %6 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28", + "q15" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. 
void ScaleAddRow_NEON(const uint8_t* src_ptr, diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index 4b4f2fb1b..3a3d499dc 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -721,6 +721,101 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr, ); } +void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int dst_width) { + const uint8_t* src_temp = src_ptr + 2; + asm volatile( + "movi v31.8b, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" // 00112233 (1u1v) + "ldr d1, [%1], #8 \n" // 11223344 (1u1v) + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b) + "ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b) + + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even) + + "rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd) + "rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even) + + "st2 {v1.4h, v2.4h}, [%2], #16 \n" // store + "subs %w3, %w3, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_temp), // %1 + "+r"(dst_ptr), // %2 + "+r"(dst_width) // %3 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List + ); +} + +void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr, + ptrdiff_t src_stride, + uint8_t* dst_ptr, + ptrdiff_t dst_stride, + int dst_width) { + const uint8_t* src_ptr1 = src_ptr + src_stride; + uint8_t* dst_ptr1 = dst_ptr + dst_stride; + const uint8_t* src_temp = src_ptr + 2; + const uint8_t* src_temp1 = src_ptr1 + 2; + + asm volatile( + "movi v31.8b, #3 \n" + "movi v30.8h, #3 \n" + + "1: \n" + "ldr d0, [%0], #8 \n" + "ldr d1, [%2], #8 \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + + "ushll v2.8h, v0.8b, #0 \n" + "ushll v3.8h, v1.8b, #0 \n" + "umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd) + "umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even) + + "ldr d0, [%1], #8 \n" + "ldr d1, [%3], #8 \n" + "prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead + + "ushll v4.8h, v0.8b, #0 \n" + "ushll v5.8h, v1.8b, #0 \n" + "umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd) + "umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even) + + "mov v0.8h, v4.8h \n" + "mov v1.8h, v5.8h \n" + "mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd) + "mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even) + "mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd) + "mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even) + + "rshrn v2.8b, v2.8h, #4 \n" // 2, odd + "rshrn v1.8b, v3.8h, #4 \n" // 2, even + "rshrn v4.8b, v4.8h, #4 \n" // 1, odd + "rshrn v3.8b, v5.8h, #4 \n" // 1, even + + "st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1 + "st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2 + "subs %w6, %w6, #8 \n" // 4 uv -> 8 uv + "b.gt 1b \n" + : "+r"(src_ptr), // %0 + "+r"(src_ptr1), // %1 + "+r"(src_temp), // %2 + "+r"(src_temp1), // %3 + "+r"(dst_ptr), // %4 + "+r"(dst_ptr1), // %5 + "+r"(dst_width) // %6 + : + : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30", + "v31" // Clobber List + ); +} + // Add a row of bytes to a row of shorts. Used for box filter. // Reads 16 bytes and accumulates to 16 shorts at a time. void ScaleAddRow_NEON(const uint8_t* src_ptr, diff --git a/source/scale_uv.cc b/source/scale_uv.cc index 2235eebe8..ab58966d5 100644 --- a/source/scale_uv.cc +++ b/source/scale_uv.cc @@ -649,6 +649,116 @@ static void ScaleUVBilinearUp(int src_width, } #endif // HAS_SCALEUVBILINEARUP +// Scale UV, horizontally up by 2 times. +// Uses linear filter horizontally, nearest vertically. 
+// This is an optimized version for scaling up a plane to 2 times of +// its original width, using linear interpolation. +// This is used to scale U and V planes of NV16 to NV24. +void ScaleUVLinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_uv, + uint8_t* dst_uv) { + void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) = + ScaleUVRowUp2_Linear_Any_C; + int i; + int y; + int dy; + + // This function can only scale up by 2 times horizontally. + assert(src_width == ((dst_width + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2LINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON; + } +#endif + + if (dst_height == 1) { + ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, + dst_width); + } else { + dy = FixedDiv(src_height - 1, dst_height - 1); + y = (1 << 15) - 1; + for (i = 0; i < dst_height; ++i) { + ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width); + dst_uv += dst_stride; + y += dy; + } + } +} + +// Scale plane, up by 2 times. +// This is an optimized version for scaling up a plane to 2 times of +// its original size, using bilinear interpolation. +// This is used to scale U and V planes of NV12 to NV24. +void ScaleUVBilinearUp2(int src_width, + int src_height, + int dst_width, + int dst_height, + int src_stride, + int dst_stride, + const uint8_t* src_ptr, + uint8_t* dst_ptr) { + void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride, + uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) = + ScaleUVRowUp2_Bilinear_Any_C; + int x; + + // This function can only scale up by 2 times. + assert(src_width == ((dst_width + 1) / 2)); + assert(src_height == ((dst_height + 1) / 2)); + +#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3 + if (TestCpuFlag(kCpuHasSSSE3)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2 + if (TestCpuFlag(kCpuHasAVX2)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2; + } +#endif + +#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON + if (TestCpuFlag(kCpuHasNEON)) { + Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON; + } +#endif + + if (src_height == 1) { + Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width); + } else { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + dst_ptr += dst_stride; + for (x = 0; x < src_height - 1; ++x) { + Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width); + src_ptr += src_stride; + // TODO: Test performance of writing one row of destination at a time. + dst_ptr += 2 * dst_stride; + } + if (!(dst_height & 1)) { + Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width); + } + } +} + // Scale UV to/from any dimensions, without interpolation. 
// Fixed point math is used for performance: The upper 16 bits // of x and dx is the integer part of the source position and @@ -844,6 +954,18 @@ static void ScaleUV(const uint8_t* src, dst_stride, src, dst, x, y, dy, 4, filtering); return; } + if (filtering && src_height == dst_height) { + ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride, + dst_stride, src, dst); + return; + } + if ((clip_height + 1) / 2 == src_height && + (clip_width + 1) / 2 == src_width && + (filtering == kFilterBilinear || filtering == kFilterBox)) { + ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height, + src_stride, dst_stride, src, dst); + return; + } #if HAS_SCALEUVBILINEARUP if (filtering && dy < 65536) { ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height, diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 20703200e..c7c5daffe 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -458,6 +458,8 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2) TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2) TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2) +TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1) +TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
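Usage note (not part of the patch): a minimal sketch of how the new NV12ToNV24 entry point added in include/libyuv/convert.h might be called. Only the NV12ToNV24() signature and its 0 / -1 return convention come from this change; the helper name, the caller-allocated std::vector buffers, and the stride choices below are illustrative assumptions.

#include <cstdint>
#include <vector>

#include "libyuv/convert.h"

// Hypothetical helper: upsample caller-provided NV12 to NV24 with the new
// libyuv::NV12ToNV24(). Buffer sizing and strides are assumptions of this
// sketch, not part of the patch.
bool UpsampleNV12ToNV24(const uint8_t* src_y, int src_stride_y,
                        const uint8_t* src_uv, int src_stride_uv,
                        int width, int height,
                        std::vector<uint8_t>* dst_y,
                        std::vector<uint8_t>* dst_uv) {
  // NV24 carries a full-resolution Y plane plus a full-resolution
  // interleaved UV plane (2 bytes per pixel), so both planes are width x height.
  dst_y->resize(static_cast<size_t>(width) * height);
  dst_uv->resize(static_cast<size_t>(width) * height * 2);
  return libyuv::NV12ToNV24(src_y, src_stride_y, src_uv, src_stride_uv,
                            dst_y->data(), /*dst_stride_y=*/width,
                            dst_uv->data(), /*dst_stride_uv=*/width * 2,
                            width, height) == 0;
}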