Add P010ToP410 and P210ToP410

These are 16 bit bi-planar convert functions that scale the UV plane up
to the Y plane's size using a (bi)linear filter.

libyuv_unittest --gtest_filter=*ToP41*
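
A minimal usage sketch for reviewers (the wrapper function, the even frame
dimensions and the element-unit strides are illustrative assumptions, not
part of this change):

// Upscale a hypothetical P010 frame to P410. Strides here are assumed to be
// counted in uint16_t elements; UV planes hold interleaved U,V pairs.
#include "libyuv/convert.h"

int UpscaleP010ToP410(const uint16_t* src_y, const uint16_t* src_uv,
                      uint16_t* dst_y, uint16_t* dst_uv,
                      int width, int height) {
  // P010: Y is width x height; UV is width/2 pairs x height/2 rows,
  //       i.e. width uint16_t elements per UV row (for even width).
  // P410: Y and UV are both full resolution (2 * width elements per UV row).
  return libyuv::P010ToP410(src_y, width,       // src_stride_y
                            src_uv, width,      // src_stride_uv
                            dst_y, width,       // dst_stride_y
                            dst_uv, width * 2,  // dst_stride_uv
                            width, height);     // returns 0 on success
}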

R=fbarchard@chromium.org

Bug: libyuv:872
Change-Id: I3cb4fafe2b2c9eedd0d91cf4c619abb9ee107bc1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2690102
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Author: Yuan Tong, 2021-02-12 10:49:25 +08:00; committed by Frank Barchard
Parent: 12a4a2372c
Commit: d4ecb70610
15 changed files with 1353 additions and 498 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1776
Version: 1777
License: BSD
License File: LICENSE


@ -315,6 +315,44 @@ int NV16ToNV24(const uint8_t* src_y,
int width,
int height);
// Convert P010 to P410.
LIBYUV_API
int P010ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert P012 to P412.
#define P012ToP412 P010ToP410
// Convert P016 to P416.
#define P016ToP416 P010ToP410
// Convert P210 to P410.
LIBYUV_API
int P210ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert P212 to P412.
#define P212ToP412 P210ToP410
// Convert P216 to P416.
#define P216ToP416 P210ToP410
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,


@ -81,10 +81,12 @@ extern "C" {
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSSE3
#define HAS_SCALEROWUP2BILINEAR_16_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_16_SSE2
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2
#endif
// The following are available for gcc/clang x86 platforms, but
@ -100,6 +102,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#define HAS_SCALEUVROWUP2LINEAR_16_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2
#endif
// The following are available on all x86 platforms, but
@ -134,6 +138,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#define HAS_SCALEUVROWUP2LINEAR_16_NEON
#define HAS_SCALEUVROWUP2BILINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@ -487,6 +493,22 @@ void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,
@ -589,10 +611,10 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
@ -629,10 +651,10 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
@ -1235,6 +1257,54 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.


@ -30,6 +30,19 @@ int UVScale(const uint8_t* src_uv,
int dst_height,
enum FilterMode filtering);
// Scale a 16 bit UV image.
// This function is currently incomplete; it cannot handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
int src_stride_uv,
int src_width,
int src_height,
uint16_t* dst_uv,
int dst_stride_uv,
int dst_width,
int dst_height,
enum FilterMode filtering);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1776
#define LIBYUV_VERSION 1777
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -663,6 +663,55 @@ int NV16ToNV24(const uint8_t* src_y,
return 0;
}
LIBYUV_API
int P010ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
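// As a worked example of the plane geometry handled by P010ToP410 above
// (assuming the usual P010 layout and a hypothetical even-sized frame):
//   For 1920x1080 input:
//   Y plane:  1920 x 1080 samples, scaled 1:1 (effectively a copy).
//   UV plane: SUBSAMPLE(1920, 1, 1) = 960 UV pairs per row and
//             SUBSAMPLE(1080, 1, 1) = 540 rows, upscaled 2x in both
//             directions by UVScale_16 to 1920 x 1080 UV pairs.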
LIBYUV_API
int P210ToP410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_uv,
int src_stride_uv,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,


@ -4190,6 +4190,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
"lea 64(%4),%4 \n"
"sub $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@ -4231,6 +4232,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
"lea 64(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
"+r"(src_b), // %2
@ -4340,9 +4342,9 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
}
#endif
static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
2, 6, 10, 14, 3, 7, 11, 15};
#ifdef HAS_SPLITARGBROW_SSSE3
static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u,
2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -4351,6 +4353,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
"movdqa %6,%%xmm3 \n"
"sub %1,%2 \n"
"sub %1,%3 \n"
"sub %1,%4 \n"
@ -4360,8 +4363,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %6,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %6,%%xmm1 \n" // 048C159D26AE37BF (hi)
"pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
@ -4385,7 +4388,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
"+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
@ -4395,13 +4398,15 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
int width) {
asm volatile(
"movdqa %5,%%xmm3 \n"
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n" // 00-0F
"movdqu 16(%0),%%xmm1 \n" // 10-1F
"pshufb %5,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %5,%%xmm1 \n" // 048C159D26AE37BF (hi)
"pshufb %%xmm3,%%xmm0 \n" // 048C159D26AE37BF (lo)
"pshufb %%xmm3,%%xmm1 \n" // 048C159D26AE37BF (hi)
"movdqa %%xmm0,%%xmm2 \n"
"punpckldq %%xmm1,%%xmm0 \n" // 048C048C159D159D (BG)
"punpckhdq %%xmm1,%%xmm2 \n" // 26AE26AE37BF37BF (RA)
@ -4421,16 +4426,12 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif
#ifdef HAS_SPLITARGBROW_AVX2
static const lvec8 kShuffleMaskARGBSplit_AVX2 = {
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u,
0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u,
2u, 6u, 3u, 7u};
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_r,
uint8_t* dst_g,
@ -4442,7 +4443,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"sub %1,%2 \n"
"sub %1,%3 \n"
"sub %1,%4 \n"
"vmovdqu %7,%%ymm3 \n"
"vmovdqa %7,%%ymm3 \n"
"vbroadcastf128 %6,%%ymm4 \n"
LABELALIGN
"1: \n"
@ -4451,8 +4453,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %6,%%ymm0,%%ymm0 \n"
"vpshufb %6,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
@ -4465,6 +4467,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
"lea 16(%1),%1 \n"
"subl $0x10,%5 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
@ -4475,9 +4478,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
#else
"+rm"(width) // %5
#endif
: "m"(kShuffleMaskARGBSplit_AVX2), // %6
"m"(kShuffleMaskARGBPermute_AVX2) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
: "m"(kShuffleMaskARGBSplit), // %6
"m"(kShuffleMaskARGBPermute) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
void SplitXRGBRow_AVX2(const uint8_t* src_argb,
@ -4487,15 +4490,18 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
int width) {
asm volatile(
"vmovdqu %6,%%ymm3 \n" LABELALIGN
"vmovdqa %6,%%ymm3 \n"
"vbroadcastf128 %5,%%ymm4 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 00-0F
"vmovdqu 16(%0),%%xmm1 \n" // 10-1F
"vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n" // 00-0F 20-2F
"vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n" // 10-1F 30-3F
"vpshufb %5,%%ymm0,%%ymm0 \n"
"vpshufb %5,%%ymm1,%%ymm1 \n"
"vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
"vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
"vpermd %%ymm0,%%ymm3,%%ymm0 \n"
"vpermd %%ymm1,%%ymm3,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n" // GA
@ -4510,13 +4516,14 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
"lea 16(%3),%3 \n"
"sub $0x10,%4 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit_AVX2), // %5
"m"(kShuffleMaskARGBPermute_AVX2) // %6
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_r), // %1
"+r"(dst_g), // %2
"+r"(dst_b), // %3
"+r"(width) // %4
: "m"(kShuffleMaskARGBSplit), // %5
"m"(kShuffleMaskARGBPermute) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif


@ -1441,20 +1441,16 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
@ -1480,9 +1476,9 @@ void ScalePlaneUp2_16_Linear(int src_width,
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3;
}
#endif
@ -1534,9 +1530,9 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3;
}
#endif
@ -1552,19 +1548,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}


@ -656,9 +656,9 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3,
ScaleRowUp2_Linear_16_SSSE3,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
@ -676,7 +676,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
15,
31,
uint16_t)
#endif
@ -744,9 +744,9 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3,
ScaleRowUp2_Bilinear_16_SSSE3,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
@ -818,6 +818,12 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
0,
uint8_t)
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
ScaleUVRowUp2_Linear_16_C,
ScaleUVRowUp2_Linear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
ScaleUVRowUp2_Linear_SSSE3,
@ -834,6 +840,22 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2,
ScaleUVRowUp2_Linear_16_SSE2,
ScaleUVRowUp2_Linear_16_C,
3,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
ScaleUVRowUp2_Linear_16_AVX2,
ScaleUVRowUp2_Linear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
ScaleUVRowUp2_Linear_NEON,
@ -842,6 +864,14 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
ScaleUVRowUp2_Linear_16_NEON,
ScaleUVRowUp2_Linear_16_C,
7,
uint16_t)
#endif
#undef SBUH2LANY
// Scale bi-planar plane up 2 times using bilinear filter.
@ -886,6 +916,12 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
0,
uint8_t)
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
ScaleUVRowUp2_Bilinear_16_C,
ScaleUVRowUp2_Bilinear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
ScaleUVRowUp2_Bilinear_SSSE3,
@ -902,6 +938,22 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2,
ScaleUVRowUp2_Bilinear_16_SSE2,
ScaleUVRowUp2_Bilinear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
ScaleUVRowUp2_Bilinear_16_AVX2,
ScaleUVRowUp2_Bilinear_16_C,
7,
uint16_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
ScaleUVRowUp2_Bilinear_NEON,
@ -910,6 +962,14 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
ScaleUVRowUp2_Bilinear_16_NEON,
ScaleUVRowUp2_Bilinear_16_C,
3,
uint16_t)
#endif
#undef SBU2BLANY
#ifdef __cplusplus


@ -1258,6 +1258,64 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
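// Each output pair of UV samples blends the two nearest source pairs with
// weights 3/4 (near) and 1/4 (far); the +2 rounds before the >> 2.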
for (x = 0; x < src_width; ++x) {
dst_ptr[4 * x + 0] =
(src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
dst_ptr[4 * x + 1] =
(src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
dst_ptr[4 * x + 2] =
(src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
dst_ptr[4 * x + 3] =
(src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
}
}
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* s = src_ptr;
const uint16_t* t = src_ptr + src_stride;
uint16_t* d = dst_ptr;
uint16_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
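// 2x2 bilinear upsample: each output sample blends the four nearest source
// samples with weights 9/16, 3/16, 3/16 and 1/16 (the separable product of
// the 3/4 and 1/4 linear weights); the +8 rounds before the >> 4.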
for (x = 0; x < src_width; ++x) {
d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 1 + 8) >>
4;
d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 1 + 8) >>
4;
d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
t[2 * x + 2] * 3 + 8) >>
4;
d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
t[2 * x + 2] * 3 + 8) >>
4;
e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
t[2 * x + 3] * 3 + 8) >>
4;
e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
t[2 * x + 2] * 9 + 8) >>
4;
e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
t[2 * x + 3] * 9 + 8) >>
4;
}
}
// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
const uint8_t* src_uv,

File diff suppressed because it is too large.


@ -791,6 +791,102 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile(
"vmov.u16 d30, #3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // 00112233 (1u1v, 16)
"vld1.16 {q1}, [%3]! \n" // 11223344 (1u1v, 16)
"vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q3, d2 \n" // 1122 (1u1v, 32b)
"vmovl.u16 q4, d1 \n" // 2233 (1u1v, 32b)
"vmovl.u16 q5, d3 \n" // 3344 (1u1v, 32b)
"vmlal.u16 q2, d2, d30 \n" // 3*near+far (odd)
"vmlal.u16 q3, d0, d30 \n" // 3*near+far (even)
"vmlal.u16 q4, d3, d30 \n" // 3*near+far (odd)
"vmlal.u16 q5, d1, d30 \n" // 3*near+far (even)
"vrshrn.u32 d1, q2, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u32 d0, q3, #2 \n" // 3/4*near+1/4*far (even)
"vrshrn.u32 d3, q4, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u32 d2, q5, #2 \n" // 3/4*near+1/4*far (even)
"vst2.32 {d0, d1}, [%1]! \n" // store
"vst2.32 {d2, d3}, [%1]! \n" // store
"subs %2, %2, #8 \n" // 4 uv -> 8 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d30" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 0011 (1u1v)
"vld1.8 {d1}, [%5]! \n" // 1122 (1u1v)
"vmovl.u16 q2, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q3, d1 \n" // 1122 (1u1v, 32b)
"vmlal.u16 q2, d1, d30 \n" // 3*near+far (1, odd)
"vmlal.u16 q3, d0, d30 \n" // 3*near+far (1, even)
"vld1.8 {d0}, [%1]! \n" // 0011 (1u1v)
"vld1.8 {d1}, [%6]! \n" // 1122 (1u1v)
"vmovl.u16 q4, d0 \n" // 0011 (1u1v, 32b)
"vmovl.u16 q5, d1 \n" // 1122 (1u1v, 32b)
"vmlal.u16 q4, d1, d30 \n" // 3*near+far (2, odd)
"vmlal.u16 q5, d0, d30 \n" // 3*near+far (2, even)
"vmovq q0, q4 \n"
"vmovq q1, q5 \n"
"vmla.u32 q4, q2, q14 \n" // 9 3 3 1 (1, odd)
"vmla.u32 q5, q3, q14 \n" // 9 3 3 1 (1, even)
"vmla.u32 q2, q0, q14 \n" // 9 3 3 1 (2, odd)
"vmla.u32 q3, q1, q14 \n" // 9 3 3 1 (2, even)
"vrshrn.u32 d1, q4, #4 \n" // 1, odd
"vrshrn.u32 d0, q5, #4 \n" // 1, even
"vrshrn.u32 d3, q2, #4 \n" // 2, odd
"vrshrn.u32 d2, q3, #4 \n" // 2, even
"vst2.32 {d0, d1}, [%2]! \n" // store
"vst2.32 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #4 \n" // 2 uv -> 4 uv
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
"d30" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -799,8 +799,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 1
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 2
"st2 {v1.4h, v2.4h}, [%5], #16 \n" // store 2
"st2 {v3.4h, v4.4h}, [%4], #16 \n" // store 1
"subs %w6, %w6, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -816,6 +816,106 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
);
}
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"ushll2 v4.4s, v0.8h, #0 \n" // 2233 (1u1v, 32b)
"ushll2 v5.4s, v1.8h, #0 \n" // 3344 (1u1v, 32b)
"umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (odd)
"umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (even)
"umlal2 v4.4s, v1.8h, v31.8h \n" // 3*near+far (odd)
"umlal2 v5.4s, v0.8h, v31.8h \n" // 3*near+far (even)
"rshrn v2.4h, v2.4s, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.4h, v3.4s, #2 \n" // 3/4*near+1/4*far (even)
"rshrn v4.4h, v4.4s, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v3.4h, v5.4s, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.2s, v2.2s}, [%2], #16 \n" // store
"st2 {v3.2s, v4.2s}, [%2], #16 \n" // store
"subs %w3, %w3, #8 \n" // 4 uv -> 8 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31" // Clobber List
);
}
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n"
"ldr d1, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v3.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"umlal v2.4s, v1.4h, v31.4h \n" // 3*near+far (1, odd)
"umlal v3.4s, v0.4h, v31.4h \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.4s, v0.4h, #0 \n" // 0011 (1u1v, 32b)
"ushll v5.4s, v1.4h, #0 \n" // 1122 (1u1v, 32b)
"umlal v4.4s, v1.4h, v31.4h \n" // 3*near+far (2, odd)
"umlal v5.4s, v0.4h, v31.4h \n" // 3*near+far (2, even)
"mov v0.4s, v4.4s \n"
"mov v1.4s, v5.4s \n"
"mla v4.4s, v2.4s, v30.4s \n" // 9 3 3 1 (1, odd)
"mla v5.4s, v3.4s, v30.4s \n" // 9 3 3 1 (1, even)
"mla v2.4s, v0.4s, v30.4s \n" // 9 3 3 1 (2, odd)
"mla v3.4s, v1.4s, v30.4s \n" // 9 3 3 1 (2, even)
"rshrn v1.4h, v2.4s, #4 \n" // 2, odd
"rshrn v0.4h, v3.4s, #4 \n" // 2, even
"rshrn v3.4h, v4.4s, #4 \n" // 1, odd
"rshrn v2.4h, v5.4s, #4 \n" // 1, even
"st2 {v0.2s, v1.2s}, [%5], #16 \n" // store 2
"st2 {v2.2s, v3.2s}, [%4], #16 \n" // store 1
"subs %w6, %w6, #4 \n" // 2 uv -> 4 uv
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -741,23 +741,124 @@ void ScaleUVBilinearUp2(int src_width,
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
// Scale 16 bit UV, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times its
// original width, using linear interpolation.
// This is used to scale U and V planes of P210 to P410.
void ScaleUVLinearUp2_16(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_uv,
uint16_t* dst_uv) {
void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
ScaleUVRowUp2_Linear_16_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
} else {
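// Step through the source rows in 16.16 fixed point, picking the nearest
// source row for each destination row (the linear filter is horizontal only).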
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
dst_uv += dst_stride;
y += dy;
}
}
}
// Scale 16 bit UV, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original size, using bilinear interpolation.
// This is used to scale U and V planes of P010 to P410.
void ScaleUVBilinearUp2_16(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleUVRowUp2_Bilinear_16_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
}
#endif
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and
@ -851,6 +952,26 @@ static int UVCopy(const uint8_t* src_UV,
CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
return 0;
}
static int UVCopy_16(const uint16_t* src_UV,
int src_stride_UV,
uint16_t* dst_UV,
int dst_stride_UV,
int width,
int height) {
if (!src_UV || !dst_UV || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_UV = src_UV + (height - 1) * src_stride_UV;
src_stride_UV = -src_stride_UV;
}
CopyPlane_16(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
return 0;
}
#endif // HAS_UVCOPY
// Scale a UV plane (from NV12)
@ -953,7 +1074,7 @@ static void ScaleUV(const uint8_t* src,
dst_stride, src, dst, x, y, dy, 4, filtering);
return;
}
if (filtering && src_height == dst_height) {
if (filtering && (dst_width + 1) / 2 == src_width) {
ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
dst_stride, src, dst);
return;
@ -1005,6 +1126,69 @@ int UVScale(const uint8_t* src_uv,
return 0;
}
// Scale a 16 bit UV image.
// This function is currently incomplete; it cannot handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
int src_stride_uv,
int src_width,
int src_height,
uint16_t* dst_uv,
int dst_stride_uv,
int dst_width,
int dst_height,
enum FilterMode filtering) {
int dy = 0;
if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
return -1;
}
// UV does not support box filter yet, but allow the user to pass it.
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative src_height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src_uv = src_uv + (src_height - 1) * src_stride_uv;
src_stride_uv = -src_stride_uv;
}
src_width = Abs(src_width);
#ifdef HAS_UVCOPY
if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
if (dst_height == 1) {
UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv,
dst_uv, dst_stride_uv, dst_width, dst_height);
} else {
dy = src_height / dst_height;
UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy,
dst_uv, dst_stride_uv, dst_width, dst_height);
}
return 0;
}
#endif
if (filtering && (dst_width + 1) / 2 == src_width) {
ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
src_stride_uv, dst_stride_uv, src_uv, dst_uv);
return 0;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
src_stride_uv, dst_stride_uv, src_uv, dst_uv);
return 0;
}
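// Other scale factors are not implemented yet.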
return -1;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv


@ -377,89 +377,119 @@ TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2)
TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
OFF, DOY) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \
NEG kHeight); \
} \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
} \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
DOY, SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
"DST SRC_SUBSAMP_X unsupported"); \
static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
"DST SRC_SUBSAMP_Y unsupported"); \
static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
"DST DST_SUBSAMP_X unsupported"); \
static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
"DST DST_SUBSAMP_Y unsupported"); \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
align_buffer_page_end(src_uv, \
2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_c, \
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
align_buffer_page_end(dst_uv_opt, \
2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_uv + OFF, 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < 2 * kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_uv_p[i] = src_uv_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \
reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, \
kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \
reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, \
kWidth, NEG kHeight); \
} \
if (DOY) { \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
} \
} \
} \
for (int i = 0; i < kDstHalfHeight; ++i) { \
for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \
EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \
dst_uv_opt[i * 2 * kDstHalfWidth + j]); \
} \
} \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
}
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
SRC_DEPTH)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
// These formats put the data in the high bits, so test the full 16 bit range.
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16)
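// For reference, each TESTBIPLANARTOBP instantiation above expands into five
// gtest cases; e.g. the P010 line defines P010ToP410_Any, _Unaligned,
// _Invert, _Opt and _NullY, all of which match the
// --gtest_filter=*ToP41* command given in the commit message.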
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \