Add NV12ToNV24 and NV16ToNV24

These are bi-planar conversion functions that scale the UV plane up to the Y plane's size using a (bi)linear filter: linear (horizontal only) for NV16, bilinear for NV12.

libyuv_unittest --gtest_filter=*ToNV24*
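
For reference, a minimal usage sketch (dimensions are illustrative; allocation
checks and error handling are omitted; assumes <stdlib.h>, <stdint.h> and
libyuv/convert.h):

  // Upsample a half-resolution NV12 UV plane to the full-resolution
  // NV24 layout. Y is passed through ScalePlane at the same size.
  int w = 1280, h = 720;
  int half_w = (w + 1) / 2, half_h = (h + 1) / 2;
  uint8_t* src_y = (uint8_t*)malloc((size_t)w * h);
  uint8_t* src_uv = (uint8_t*)malloc((size_t)2 * half_w * half_h);  // interleaved U/V
  uint8_t* dst_y = (uint8_t*)malloc((size_t)w * h);
  uint8_t* dst_uv = (uint8_t*)malloc((size_t)2 * w * h);
  NV12ToNV24(src_y, w, src_uv, 2 * half_w,
             dst_y, w, dst_uv, 2 * w, w, h);  // returns 0 on success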

R=fbarchard@chromium.org

Change-Id: I3d98f833feeef00af3c903ac9ad0e41bdcbcb51f
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2682152
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-02-09 13:09:59 +08:00 committed by Frank Barchard
parent 942c508448
commit f7fc83f46d
13 changed files with 951 additions and 71 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1775
Version: 1776
License: BSD
License File: LICENSE

View File

@@ -289,6 +289,32 @@ int NV21ToI420(const uint8_t* src_y,
int width,
int height);
// Convert NV12 to NV24.
LIBYUV_API
int NV12ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert NV16 to NV24.
LIBYUV_API
int NV16ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
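// For both functions above, the UV planes are interleaved (U0 V0 U1 V1 ...).
// The NV12 source UV plane is ((width + 1) / 2) UV pairs wide by
// ((height + 1) / 2) rows; the NV16 source UV plane is the same width but
// full height. The NV24 destination UV plane is width UV pairs by height
// rows, i.e. the same dimensions as the Y plane.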
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

View File

@@ -77,12 +77,14 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALECOLUP2LINEAR_SSE2
#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALECOLUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -92,10 +94,12 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALECOLUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2BILINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -124,10 +128,12 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALECOLUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2BILINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -464,6 +470,24 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
int src_stepx,
uint8_t* dst_uv,
int dst_width);
void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
int dst_width,
@@ -1163,6 +1187,55 @@ void ScaleUVRowDownEvenBox_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1775
#define LIBYUV_VERSION 1776
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -16,6 +16,7 @@
#include "libyuv/rotate.h"
#include "libyuv/row.h"
#include "libyuv/scale.h" // For ScalePlane()
#include "libyuv/scale_uv.h" // For UVScale()
#ifdef __cplusplus
namespace libyuv {
@@ -613,6 +614,55 @@ int NV21ToI420(const uint8_t* src_y,
width, height);
}
LIBYUV_API
int NV12ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
LIBYUV_API
int NV16ToNV24(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
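// Note on the source geometry above: SUBSAMPLE(v, a, s) rounds up before
// shifting, so SUBSAMPLE(width, 1, 1) == (width + 1) >> 1 for non-negative
// widths. E.g. a 641-pixel-wide NV12 frame carries 321 UV pairs per row,
// while NV16 passes the full height since its chroma is only subsampled
// horizontally.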
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

View File

@@ -1415,27 +1415,27 @@ void ScalePlaneUp2_Bilinear(int src_width,
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
@@ -1480,19 +1480,19 @@ void ScalePlaneUp2_16_Linear(int src_width,
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
@@ -1532,21 +1532,21 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}

View File

@@ -640,7 +640,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
0,
uint16_t)
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
@@ -648,7 +648,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
@@ -656,7 +656,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
@@ -664,7 +664,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
@@ -672,7 +672,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
@@ -680,7 +680,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
@@ -688,7 +688,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
@@ -699,7 +699,7 @@ SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
#undef SUH2LANY
// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time
// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
@@ -736,7 +736,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
0,
uint16_t)
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
@@ -744,7 +744,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
@@ -752,7 +752,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
@@ -760,7 +760,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
@@ -768,7 +768,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
@@ -776,7 +776,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
@@ -784,7 +784,7 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
#ifdef HAS_SCALEROWUP2BILINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
@@ -794,6 +794,120 @@ SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
#undef SU2BLANY
// Scale a bi-planar UV plane up horizontally by 2 times using a linear filter.
#define SBUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
dst_ptr[0] = src_ptr[0]; \
dst_ptr[1] = src_ptr[1]; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(src_ptr, dst_ptr + 2, n); \
} \
C(src_ptr + n, dst_ptr + 2 * n + 2, r); \
} \
dst_ptr[2 * dst_width - 2] = src_ptr[((dst_width + 1) & ~1) - 2]; \
dst_ptr[2 * dst_width - 1] = src_ptr[((dst_width + 1) & ~1) - 1]; \
}
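// Worked example of the split: for the NEON variant (MASK == 7) with
// dst_width == 23 UV pairs, work_width = (23 - 1) & ~1 = 22, so the SIMD
// kernel produces n = 22 & ~7 = 16 pairs, the C kernel finishes r = 22 & 7
// = 6 pairs, and the first and last output pairs are written directly above.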
SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
ScaleUVRowUp2_Linear_C,
ScaleUVRowUp2_Linear_C,
0,
uint8_t)
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
ScaleUVRowUp2_Linear_SSSE3,
ScaleUVRowUp2_Linear_C,
7,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
ScaleUVRowUp2_Linear_AVX2,
ScaleUVRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
ScaleUVRowUp2_Linear_NEON,
ScaleUVRowUp2_Linear_C,
7,
uint8_t)
#endif
#undef SBUH2LANY
// Scale a bi-planar UV plane up by 2 times using a bilinear filter.
// This function produces 2 rows at a time.
#define SBU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
const PTYPE* sa = src_ptr; \
const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0] + 2) >> 2; \
db[0] = (sa[0] + 3 * sb[0] + 2) >> 2; \
da[1] = (3 * sa[1] + sb[1] + 2) >> 2; \
db[1] = (sa[1] + 3 * sb[1] + 2) >> 2; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(sa, sb - sa, da + 2, db - da, n); \
} \
C(sa + n, sb - sa, da + 2 * n + 2, db - da, r); \
} \
da[2 * dst_width - 2] = (3 * sa[((dst_width + 1) & ~1) - 2] + \
sb[((dst_width + 1) & ~1) - 2] + 2) >> 2; \
db[2 * dst_width - 2] = (sa[((dst_width + 1) & ~1) - 2] + \
3 * sb[((dst_width + 1) & ~1) - 2] + 2) >> 2; \
da[2 * dst_width - 1] = (3 * sa[((dst_width + 1) & ~1) - 1] + \
sb[((dst_width + 1) & ~1) - 1] + 2) >> 2; \
db[2 * dst_width - 1] = (sa[((dst_width + 1) & ~1) - 1] + \
3 * sb[((dst_width + 1) & ~1) - 1] + 2) >> 2; \
}
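// Note: the SIMD and C kernels are handed (sb - sa) and (db - da) as the
// source and destination strides, so a single two-row kernel invocation
// reads both source rows and writes both output rows; only the edge
// columns are blended in scalar code above.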
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
ScaleUVRowUp2_Bilinear_C,
ScaleUVRowUp2_Bilinear_C,
0,
uint8_t)
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
ScaleUVRowUp2_Bilinear_SSSE3,
ScaleUVRowUp2_Bilinear_C,
7,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
ScaleUVRowUp2_Bilinear_AVX2,
ScaleUVRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
ScaleUVRowUp2_Bilinear_NEON,
ScaleUVRowUp2_Bilinear_C,
7,
uint8_t)
#endif
#undef SBU2BLANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -1200,6 +1200,56 @@ void ScaleUVRowDownEvenBox_C(const uint8_t* src_uv,
}
}
void ScaleUVRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[4 * x + 0] =
(src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
dst_ptr[4 * x + 1] =
(src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
dst_ptr[4 * x + 2] =
(src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
dst_ptr[4 * x + 3] =
(src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
}
}
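// The weights implement (3 * near + far + 2) >> 2, i.e. output samples sit
// 1/4 and 3/4 of the way between source samples, with round-to-nearest.
// E.g. near = 100, far = 40: (300 + 40 + 2) >> 2 = 85 = 0.75*100 + 0.25*40.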
void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint8_t* d = dst_ptr;
uint8_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 1 + 8) >> 4;
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 1 + 8) >> 4;
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
t[2 * x + 2] * 3 + 8) >> 4;
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
t[2 * x + 3] * 3 + 8) >> 4;
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
t[2 * x + 2] * 3 + 8) >> 4;
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
t[2 * x + 3] * 3 + 8) >> 4;
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 9 + 8) >> 4;
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 9 + 8) >> 4;
}
}
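// The 2D weights are the outer product of the 1D (3, 1)/4 kernel with
// itself: (9, 3, 3, 1)/16, rounded via the +8 term. E.g. s0 = 100, s1 = 40,
// t0 = 60, t1 = 20 gives (900 + 120 + 180 + 20 + 8) >> 4 = 76 for d[0].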
// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,

View File

@@ -779,7 +779,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"xmm7");
}
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
@@ -833,7 +833,7 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -949,7 +949,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
@@ -999,7 +999,7 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -1106,7 +1106,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1149,7 +1149,7 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
#ifdef HAS_SCALEROWUP2BILINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1236,7 +1236,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
@@ -1281,7 +1281,7 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1364,7 +1364,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
@@ -1450,7 +1450,7 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
#ifdef HAS_SCALEROWUP2BILINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
@@ -2261,6 +2261,257 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
}
#endif // HAS_SCALEUVROWDOWN2BOX_AVX2
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
static const uvec8 kUVLinearMadd31_SSSE3 = {3, 1, 3, 1, 1, 3, 1, 3,
3, 1, 3, 1, 1, 3, 1, 3};
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
"punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (1u1v16, hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (1u1v16, lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kUVLinearMadd31_SSSE3) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
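// A note on kUVLinearMadd31_SSSE3 above: after the punpcklbw/punpckldq
// shuffles, each 16-bit lane of xmm0/xmm2 holds a (near, far) byte pair for
// one channel, so pmaddubsw with the alternating 3,1 / 1,3 pattern computes
// 3*near + far for the even output pairs and near + 3*far for the odd ones
// in a single instruction.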
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 00112233 (1u1v)
"movq 2(%0),%%xmm1 \n" // 11223344 (1u1v)
"punpcklbw %%xmm1,%%xmm0 \n" // 0101121223233434 (2u2v)
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm0,%%xmm2 \n" // 2323232334343434 (2u2v)
"punpckldq %%xmm0,%%xmm0 \n" // 0101010112121212 (2u2v)
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1u1v16, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1u1v16, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 2(%0,%3),%%xmm4 \n"
"punpcklbw %%xmm4,%%xmm1 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm1,%%xmm3 \n"
"punpckldq %%xmm1,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 4 uv to 8 uv
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31_SSSE3) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
static const lvec8 kUVLinearMadd31_AVX2 = {3, 1, 3, 1, 1, 3, 1, 3, 3, 1, 3,
1, 1, 3, 1, 3, 3, 1, 3, 1, 1, 3,
1, 3, 3, 1, 3, 1, 1, 3, 1, 3};
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kUVLinearMadd31_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n"
"vmovdqu 2(%0),%%xmm1 \n"
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vpunpckhdq %%ymm0,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 2(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklbw %%ymm3,%%ymm2,%%ymm2 \n"
"vpunpckhdq %%ymm2,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 uv to 16 uv
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kUVLinearMadd31_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#endif // defined(__x86_64__) || defined(__i386__)
#ifdef __cplusplus

View File

@@ -509,20 +509,19 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d30, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%3]! \n" // 12345678
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%3]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
"vmovl.u8 q0, d4 \n" // 01234567 (16b)
"vmovl.u8 q1, d5 \n" // 12345678 (16b)
"vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
"vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
"vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
@@ -548,25 +547,24 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%5]! \n" // 12345678
"vld1.8 {d4}, [%0]! \n" // 01234567
"vld1.8 {d5}, [%5]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
"vmovl.u8 q0, d4 \n" // 01234567 (16b)
"vmovl.u8 q1, d5 \n" // 12345678 (16b)
"vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
"vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
"vld1.8 {d4}, [%1]! \n" // 01234567
"vld1.8 {d6}, [%6]! \n" // 12345678
"vld1.8 {d8}, [%1]! \n"
"vld1.8 {d9}, [%6]! \n"
"vmovl.u8 q2, d4 \n" // 01234567 (16b)
"vmovl.u8 q3, d6 \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
"vmovl.u8 q2, d8 \n"
"vmovl.u8 q3, d9 \n"
"vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
"vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
// e o
// q1 q0
@@ -600,7 +598,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
"q15" // Clobber List
);
}
@@ -694,6 +692,105 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%3]! \n" // 11223344 (1u1v)
"vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q0, d5, d30 \n" // 3*near+far (odd)
"vmlal.u8 q1, d4, d30 \n" // 3*near+far (even)
"vrshrn.u16 d1, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.16 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "d30" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
"1: \n"
"vld1.8 {d4}, [%0]! \n" // 00112233 (1u1v)
"vld1.8 {d5}, [%5]! \n" // 11223344 (1u1v)
"vmovl.u8 q0, d4 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q1, d5 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q0, d5, d28 \n" // 3*near+far (1, odd)
"vmlal.u8 q1, d4, d28 \n" // 3*near+far (1, even)
"vld1.8 {d8}, [%1]! \n" // 00112233 (1u1v)
"vld1.8 {d9}, [%6]! \n" // 11223344 (1u1v)
"vmovl.u8 q2, d8 \n" // 00112233 (1u1v, 16b)
"vmovl.u8 q3, d9 \n" // 11223344 (1u1v, 16b)
"vmlal.u8 q2, d9, d28 \n" // 3*near+far (2, odd)
"vmlal.u8 q3, d8, d28 \n" // 3*near+far (2, even)
// e o
// q1 q0
// q3 q2
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
// e o
// q5 q4
// q1 q0
"vrshrn.u16 d2, q1, #4 \n" // 2, even
"vrshrn.u16 d3, q0, #4 \n" // 2, odd
"vrshrn.u16 d0, q5, #4 \n" // 1, even
"vrshrn.u16 d1, q4, #4 \n" // 1, odd
"vst2.16 {d0, d1}, [%2]! \n" // store
"vst2.16 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d28",
"q15" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

View File

@@ -721,6 +721,101 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 00112233 (1u1v)
"ldr d1, [%1], #8 \n" // 11223344 (1u1v)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 00112233 (1u1v, 16b)
"ushll v3.8h, v1.8b, #0 \n" // 11223344 (1u1v, 16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
"rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.4h, v2.4h}, [%2], #16 \n" // store
"subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
);
}
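// The even and odd results above are kept in separate vectors, and
// "st2 {v1.4h, v2.4h}" re-interleaves them on store with 16-bit elements,
// so each element carries one whole UV pair and the U/V bytes never cross
// channels.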
void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n"
"ushll v3.8h, v1.8b, #0 \n"
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.8h, v0.8b, #0 \n"
"ushll v5.8h, v1.8b, #0 \n"
"umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
"umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
"rshrn v2.8b, v2.8h, #4 \n" // 2, odd
"rshrn v1.8b, v3.8h, #4 \n" // 2, even
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2
"subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

View File

@@ -649,6 +649,116 @@ static void ScaleUVBilinearUp(int src_width,
}
#endif // HAS_SCALEUVBILINEARUP
// Scale UV, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to twice its
// original width, using linear interpolation.
// This is used to scale the U and V planes of NV16 to NV24.
void ScaleUVLinearUp2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_uv,
uint8_t* dst_uv) {
void (*ScaleRowUp)(const uint8_t* src_uv, uint8_t* dst_uv, int dst_width) =
ScaleUVRowUp2_Linear_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleUVRowUp2_Linear_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
dst_uv += dst_stride;
y += dy;
}
}
}
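// Vertical stepping above is 16.16 fixed point: FixedDiv(src_height - 1,
// dst_height - 1) maps the last source row to the last destination row, and
// starting y at (1 << 15) - 1 (just under 0.5) rounds row selection to the
// nearest source row. E.g. 4 rows -> 7 rows: dy = (3 << 16) / 6 = 0x8000,
// so y >> 16 visits source rows 0, 0, 1, 1, 2, 2, 3.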
// Scale UV plane up by 2 times.
// This is an optimized version for scaling up a plane to twice its
// original size, using bilinear interpolation.
// This is used to scale the U and V planes of NV12 to NV24.
void ScaleUVBilinearUp2(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleUVRowUp2_Bilinear_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
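// Row bookkeeping above: the first call passes stride 0 so both filter taps
// read source row 0 and both outputs land on destination row 0 (a seed row);
// each loop iteration blends one source row pair into two destination rows;
// and when dst_height is even, a final stride-0 call replicates the last
// source row. Total: 1 + 2*(src_height - 1) (+1 if even) = dst_height rows.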
// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
@@ -844,6 +954,18 @@ static void ScaleUV(const uint8_t* src,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && (clip_width + 1) / 2 == src_width &&
src_height == clip_height) {
ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst);
return;
}
if ((clip_height + 1) / 2 == src_height &&
(clip_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScaleUVBilinearUp2(src_width, src_height, clip_width, clip_height,
src_stride, dst_stride, src, dst);
return;
}
#if HAS_SCALEUVBILINEARUP
if (filtering && dy < 65536) {
ScaleUVBilinearUp(src_width, src_height, clip_width, clip_height,

View File

@@ -458,6 +458,8 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
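// In TESTBIPLANARTOBP the numeric arguments are the source and destination
// X/Y chroma subsampling factors, so (NV12, 2, 2, NV24, 1, 1) exercises the
// 2x2 UV upsample and (NV16, 2, 1, NV24, 1, 1) the horizontal-only case.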
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \