Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2026-01-01 03:12:16 +08:00)
Add P010ToP410 and P210ToP410
These are 16-bit bi-planar convert functions that scale the UV plane up to the Y plane's size using a (bi)linear filter.

libyuv_unittest --gtest_filter=*ToP41*

R=fbarchard@chromium.org
Bug: libyuv:872
Change-Id: I3cb4fafe2b2c9eedd0d91cf4c619abb9ee107bc1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2690102
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent 12a4a2372c
commit d4ecb70610
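For orientation before the diff: a minimal sketch of calling the new P010ToP410 entry point. Everything here except the function signature is an illustrative assumption (buffer sizes, the header name, and the convention that 16-bit strides are counted in uint16_t units, which matches the ScalePlane_16 calls later in this change):

    #include <cstdint>
    #include <vector>

    #include "libyuv/convert.h"  // assumed location of the declarations below

    // Upsample the half-size interleaved UV plane of a P010 frame to P410.
    int ConvertFrame(const uint16_t* src_y, const uint16_t* src_uv,
                     int width, int height) {
      const int half_width = (width + 1) / 2;  // P010 UV is half width/height
      std::vector<uint16_t> dst_y(static_cast<size_t>(width) * height);
      std::vector<uint16_t> dst_uv(2 * static_cast<size_t>(width) * height);
      return libyuv::P010ToP410(src_y, width, src_uv, 2 * half_width,
                                dst_y.data(), width, dst_uv.data(), 2 * width,
                                width, height);
    }

Passing NULL for dst_y skips the Y plane entirely, which is what the _NullY unit tests added below exercise.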
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1776
Version: 1777
License: BSD
License File: LICENSE

@@ -315,6 +315,44 @@ int NV16ToNV24(const uint8_t* src_y,
               int width,
               int height);

// Convert P010 to P410.
LIBYUV_API
int P010ToP410(const uint16_t* src_y,
               int src_stride_y,
               const uint16_t* src_uv,
               int src_stride_uv,
               uint16_t* dst_y,
               int dst_stride_y,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height);

// Convert P012 to P412.
#define P012ToP412 P010ToP410

// Convert P016 to P416.
#define P016ToP416 P010ToP410

// Convert P210 to P410.
LIBYUV_API
int P210ToP410(const uint16_t* src_y,
               int src_stride_y,
               const uint16_t* src_uv,
               int src_stride_uv,
               uint16_t* dst_y,
               int dst_stride_y,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height);

// Convert P212 to P412.
#define P212ToP412 P210ToP410

// Convert P216 to P416.
#define P216ToP416 P210ToP410
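The aliases work because the row filters below are plain weighted averages of 16-bit samples: the arithmetic never depends on whether 10, 12, or all 16 of the bits are significant, and the worst case cannot overflow. A standalone sanity check of that claim (a sketch, not part of the change):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      // Worst-case linear tap: (3 * near + far + 2) >> 2 on all-ones samples.
      uint32_t lin = (3u * 0xFFFFu + 0xFFFFu + 2u) >> 2;  // intermediate < 2^18
      assert(lin <= 0xFFFFu);  // result still fits in 16 bits
      // Worst-case bilinear tap: weights 9 + 3 + 3 + 1 = 16, rounding term +8.
      uint32_t bil =
          (9u * 0xFFFFu + 3u * 0xFFFFu + 3u * 0xFFFFu + 0xFFFFu + 8u) >> 4;
      assert(bil <= 0xFFFFu);
      return 0;
    }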

// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

@@ -81,10 +81,12 @@ extern "C" {
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2BILINEAR_SSE2
#define HAS_SCALEROWUP2BILINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSSE3
#define HAS_SCALEROWUP2BILINEAR_16_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_16_SSE2
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2
#endif

// The following are available for gcc/clang x86 platforms, but

@@ -100,6 +102,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_AVX2
#define HAS_SCALEUVROWUP2LINEAR_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_AVX2
#define HAS_SCALEUVROWUP2LINEAR_16_AVX2
#define HAS_SCALEUVROWUP2BILINEAR_16_AVX2
#endif

// The following are available on all x86 platforms, but

@@ -134,6 +138,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_NEON
#define HAS_SCALEUVROWUP2LINEAR_NEON
#define HAS_SCALEUVROWUP2BILINEAR_NEON
#define HAS_SCALEUVROWUP2LINEAR_16_NEON
#define HAS_SCALEUVROWUP2BILINEAR_16_NEON
#endif

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

@@ -487,6 +493,22 @@ void ScaleUVRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
                                  uint8_t* dst_ptr,
                                  ptrdiff_t dst_stride,
                                  int dst_width);
void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               int dst_width);
void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint16_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width);
void ScaleUVRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
                                   uint16_t* dst_ptr,
                                   int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint16_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width);

void ScaleUVCols_C(uint8_t* dst_uv,
                   const uint8_t* src_uv,

@@ -589,10 +611,10 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
                               uint8_t* dst_ptr,
                               ptrdiff_t dst_stride,
                               int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSSE3(const uint16_t* src_ptr,
                                 uint16_t* dst_ptr,
                                 int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_SSSE3(const uint16_t* src_ptr,
                                   ptrdiff_t src_stride,
                                   uint16_t* dst_ptr,
                                   ptrdiff_t dst_stride,

@@ -629,10 +651,10 @@ void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
                                   uint8_t* dst_ptr,
                                   ptrdiff_t dst_stride,
                                   int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_Any_SSSE3(const uint16_t* src_ptr,
                                     uint16_t* dst_ptr,
                                     int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Bilinear_16_Any_SSSE3(const uint16_t* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint16_t* dst_ptr,
                                       ptrdiff_t dst_stride,

@@ -1235,6 +1257,54 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
                                     uint8_t* dst_ptr,
                                     ptrdiff_t dst_stride,
                                     int dst_width);
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width);
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width);
void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
                                      uint16_t* dst_ptr,
                                      int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint16_t* dst_ptr,
                                        ptrdiff_t dst_stride,
                                        int dst_width);
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width);
void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width);
void ScaleUVRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
                                      uint16_t* dst_ptr,
                                      int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint16_t* dst_ptr,
                                        ptrdiff_t dst_stride,
                                        int dst_width);
void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width);
void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width);
void ScaleUVRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
                                      uint16_t* dst_ptr,
                                      int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint16_t* dst_ptr,
                                        ptrdiff_t dst_stride,
                                        int dst_width);

// ScaleRowDown2Box also used by planar functions
// NEON downscalers with interpolation.

@@ -30,6 +30,19 @@ int UVScale(const uint8_t* src_uv,
            int dst_height,
            enum FilterMode filtering);

// Scale a 16-bit UV image.
// This function is currently incomplete; it can't handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
               int src_stride_uv,
               int src_width,
               int src_height,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int dst_width,
               int dst_height,
               enum FilterMode filtering);

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 1776
#define LIBYUV_VERSION 1777

#endif  // INCLUDE_LIBYUV_VERSION_H_

@@ -663,6 +663,55 @@ int NV16ToNV24(const uint8_t* src_y,
  return 0;
}

LIBYUV_API
int P010ToP410(const uint16_t* src_y,
               int src_stride_y,
               const uint16_t* src_uv,
               int src_stride_uv,
               uint16_t* dst_y,
               int dst_stride_y,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  if (width == 0 || height == 0) {
    return -1;
  }

  if (dst_y) {
    ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
                  Abs(width), Abs(height), kFilterBilinear);
  }
  UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1),
             SUBSAMPLE(height, 1, 1), dst_uv, dst_stride_uv, Abs(width),
             Abs(height), kFilterBilinear);
  return 0;
}
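P010 carries UV at half width and half height, hence both SUBSAMPLE terms above (P210, next, is only subsampled horizontally, so it passes height through unchanged). SUBSAMPLE rounds up when halving; a simplified sketch of the macro (libyuv's real definition also handles negative values):

    #include <assert.h>

    #define SUBSAMPLE(v, a, s) (((v) + (a)) >> (s))  // simplified, v >= 0 only

    int main(void) {
      assert(SUBSAMPLE(1280, 1, 1) == 640);  // even width halves exactly
      assert(SUBSAMPLE(1281, 1, 1) == 641);  // odd width rounds up
      return 0;
    }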

LIBYUV_API
int P210ToP410(const uint16_t* src_y,
               int src_stride_y,
               const uint16_t* src_uv,
               int src_stride_uv,
               uint16_t* dst_y,
               int dst_stride_y,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int width,
               int height) {
  if (width == 0 || height == 0) {
    return -1;
  }

  if (dst_y) {
    ScalePlane_16(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
                  Abs(width), Abs(height), kFilterBilinear);
  }
  UVScale_16(src_uv, src_stride_uv, SUBSAMPLE(width, 1, 1), height, dst_uv,
             dst_stride_uv, Abs(width), Abs(height), kFilterBilinear);
  return 0;
}

// Convert YUY2 to I420.
LIBYUV_API
int YUY2ToI420(const uint8_t* src_yuy2,

@@ -4190,6 +4190,7 @@ void MergeARGBRow_AVX2(const uint8_t* src_r,
      "lea 64(%4),%4 \n"
      "sub $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2

@@ -4231,6 +4232,7 @@ void MergeXRGBRow_AVX2(const uint8_t* src_r,
      "lea 64(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_r),  // %0
        "+r"(src_g),  // %1
        "+r"(src_b),  // %2

@@ -4340,9 +4342,9 @@ void SplitXRGBRow_SSE2(const uint8_t* src_argb,
}
#endif

static const uvec8 kShuffleMaskARGBSplit = {0, 4, 8, 12, 1, 5, 9, 13,
                                            2, 6, 10, 14, 3, 7, 11, 15};
#ifdef HAS_SPLITARGBROW_SSSE3
static const uvec8 kShuffleMaskARGBSplit = {0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u,
                                            2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        uint8_t* dst_r,
                        uint8_t* dst_g,

@@ -4351,6 +4353,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
                        int width) {
  asm volatile(

      "movdqa %6,%%xmm3 \n"
      "sub %1,%2 \n"
      "sub %1,%3 \n"
      "sub %1,%4 \n"

@@ -4360,8 +4363,8 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,

      "movdqu (%0),%%xmm0 \n"  // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "pshufb %6,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %6,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "pshufb %%xmm3,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %%xmm3,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)

@@ -4385,7 +4388,7 @@ void SplitARGBRow_SSSE3(const uint8_t* src_argb,
        "+rm"(width)  // %5
#endif
      : "m"(kShuffleMaskARGBSplit)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void SplitXRGBRow_SSSE3(const uint8_t* src_argb,

@@ -4395,13 +4398,15 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
                        int width) {
  asm volatile(

      "movdqa %5,%%xmm3 \n"

      LABELALIGN
      "1: \n"

      "movdqu (%0),%%xmm0 \n"  // 00-0F
      "movdqu 16(%0),%%xmm1 \n"  // 10-1F
      "pshufb %5,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %5,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "pshufb %%xmm3,%%xmm0 \n"  // 048C159D26AE37BF (lo)
      "pshufb %%xmm3,%%xmm1 \n"  // 048C159D26AE37BF (hi)
      "movdqa %%xmm0,%%xmm2 \n"
      "punpckldq %%xmm1,%%xmm0 \n"  // 048C048C159D159D (BG)
      "punpckhdq %%xmm1,%%xmm2 \n"  // 26AE26AE37BF37BF (RA)

@@ -4421,16 +4426,12 @@ void SplitXRGBRow_SSSE3(const uint8_t* src_argb,
        "+r"(dst_b),  // %3
        "+r"(width)  // %4
      : "m"(kShuffleMaskARGBSplit)  // %5
      : "memory", "cc", "xmm0", "xmm1", "xmm2");
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

#ifdef HAS_SPLITARGBROW_AVX2
static const lvec8 kShuffleMaskARGBSplit_AVX2 = {
    0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u,
    0u, 4u, 8u, 12u, 1u, 5u, 9u, 13u, 2u, 6u, 10u, 14u, 3u, 7u, 11u, 15u};
static const ulvec32 kShuffleMaskARGBPermute_AVX2 = {0u, 4u, 1u, 5u,
                                                     2u, 6u, 3u, 7u};
static const ulvec32 kShuffleMaskARGBPermute = {0, 4, 1, 5, 2, 6, 3, 7};
void SplitARGBRow_AVX2(const uint8_t* src_argb,
                       uint8_t* dst_r,
                       uint8_t* dst_g,

@@ -4442,7 +4443,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
      "sub %1,%2 \n"
      "sub %1,%3 \n"
      "sub %1,%4 \n"
      "vmovdqu %7,%%ymm3 \n"
      "vmovdqa %7,%%ymm3 \n"
      "vbroadcastf128 %6,%%ymm4 \n"

      LABELALIGN
      "1: \n"

@@ -4451,8 +4453,8 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
      "vmovdqu 16(%0),%%xmm1 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"  // 10-1F 30-3F
      "vpshufb %6,%%ymm0,%%ymm0 \n"
      "vpshufb %6,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"  // GA

@@ -4465,6 +4467,7 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
      "lea 16(%1),%1 \n"
      "subl $0x10,%5 \n"
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2

@@ -4475,9 +4478,9 @@ void SplitARGBRow_AVX2(const uint8_t* src_argb,
#else
        "+rm"(width)  // %5
#endif
      : "m"(kShuffleMaskARGBSplit_AVX2),  // %6
        "m"(kShuffleMaskARGBPermute_AVX2)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
      : "m"(kShuffleMaskARGBSplit),  // %6
        "m"(kShuffleMaskARGBPermute)  // %7
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}

void SplitXRGBRow_AVX2(const uint8_t* src_argb,

@@ -4487,15 +4490,18 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
                       int width) {
  asm volatile(

      "vmovdqu %6,%%ymm3 \n" LABELALIGN
      "vmovdqa %6,%%ymm3 \n"
      "vbroadcastf128 %5,%%ymm4 \n"

      LABELALIGN
      "1: \n"

      "vmovdqu (%0),%%xmm0 \n"  // 00-0F
      "vmovdqu 16(%0),%%xmm1 \n"  // 10-1F
      "vinserti128 $1,32(%0),%%ymm0,%%ymm0 \n"  // 00-0F 20-2F
      "vinserti128 $1,48(%0),%%ymm1,%%ymm1 \n"  // 10-1F 30-3F
      "vpshufb %5,%%ymm0,%%ymm0 \n"
      "vpshufb %5,%%ymm1,%%ymm1 \n"
      "vpshufb %%ymm4,%%ymm0,%%ymm0 \n"
      "vpshufb %%ymm4,%%ymm1,%%ymm1 \n"
      "vpermd %%ymm0,%%ymm3,%%ymm0 \n"
      "vpermd %%ymm1,%%ymm3,%%ymm1 \n"
      "vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"  // GA

@@ -4510,13 +4516,14 @@ void SplitXRGBRow_AVX2(const uint8_t* src_argb,
      "lea 16(%3),%3 \n"
      "sub $0x10,%4 \n"
      "jg 1b \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2
        "+r"(dst_b),  // %3
        "+r"(width)  // %4
      : "m"(kShuffleMaskARGBSplit_AVX2),  // %5
        "m"(kShuffleMaskARGBPermute_AVX2)  // %6
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_r),  // %1
        "+r"(dst_g),  // %2
        "+r"(dst_b),  // %3
        "+r"(width)  // %4
      : "m"(kShuffleMaskARGBSplit),  // %5
        "m"(kShuffleMaskARGBPermute)  // %6
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
#endif

@@ -1441,20 +1441,16 @@ void ScalePlaneUp2_Bilinear(int src_width,
  }
#endif

  if (src_height == 1) {
    Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
  } else {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
    dst_ptr += dst_stride;
    for (x = 0; x < src_height - 1; ++x) {
      Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
      src_ptr += src_stride;
      // TODO: Test performance of writing one row of destination at a time.
      dst_ptr += 2 * dst_stride;
    }
    if (!(dst_height & 1)) {
      Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      dst_ptr += dst_stride;
      for (x = 0; x < src_height - 1; ++x) {
        Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
        src_ptr += src_stride;
        // TODO: Test performance of writing one row of destination at a time.
        dst_ptr += 2 * dst_stride;
      }
      if (!(dst_height & 1)) {
        Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      }
    }
}

@@ -1480,9 +1476,9 @@ void ScalePlaneUp2_16_Linear(int src_width,
  // This function can only scale up by 2 times horizontally.
  assert(src_width == ((dst_width + 1) / 2));

#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSSE3;
  }
#endif

@@ -1534,9 +1530,9 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
  assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
  if (TestCpuFlag(kCpuHasSSSE3)) {
    Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSSE3;
  }
#endif

@@ -1552,19 +1548,15 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
  }
#endif

  if (src_height == 1) {
    Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
  } else {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
    dst_ptr += dst_stride;
    for (x = 0; x < src_height - 1; ++x) {
      Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
      src_ptr += src_stride;
      dst_ptr += 2 * dst_stride;
    }
    if (!(dst_height & 1)) {
      Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      dst_ptr += dst_stride;
      for (x = 0; x < src_height - 1; ++x) {
        Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
        src_ptr += src_stride;
        dst_ptr += 2 * dst_stride;
      }
      if (!(dst_height & 1)) {
        Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      }
    }
}

@@ -656,9 +656,9 @@ SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
         uint8_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
         ScaleRowUp2_Linear_16_SSE2,
#ifdef HAS_SCALEROWUP2LINEAR_16_SSSE3
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSSE3,
         ScaleRowUp2_Linear_16_SSSE3,
         ScaleRowUp2_Linear_16_C,
         15,
         uint16_t)

@@ -676,7 +676,7 @@ SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
         ScaleRowUp2_Linear_16_AVX2,
         ScaleRowUp2_Linear_16_C,
         15,
         31,
         uint16_t)
#endif

@@ -744,9 +744,9 @@ SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
         uint8_t)
#endif

#ifdef HAS_SCALEROWUP2BILINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
         ScaleRowUp2_Bilinear_16_SSE2,
#ifdef HAS_SCALEROWUP2BILINEAR_16_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSSE3,
         ScaleRowUp2_Bilinear_16_SSSE3,
         ScaleRowUp2_Bilinear_16_C,
         15,
         uint16_t)

@@ -818,6 +818,12 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_C,
          0,
          uint8_t)

SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_C,
          ScaleUVRowUp2_Linear_16_C,
          ScaleUVRowUp2_Linear_16_C,
          0,
          uint16_t)

#ifdef HAS_SCALEUVROWUP2LINEAR_SSSE3
SBUH2LANY(ScaleUVRowUp2_Linear_Any_SSSE3,
          ScaleUVRowUp2_Linear_SSSE3,

@@ -834,6 +840,22 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
          uint8_t)
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2,
          ScaleUVRowUp2_Linear_16_SSE2,
          ScaleUVRowUp2_Linear_16_C,
          3,
          uint16_t)
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_AVX2,
          ScaleUVRowUp2_Linear_16_AVX2,
          ScaleUVRowUp2_Linear_16_C,
          7,
          uint16_t)
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
          ScaleUVRowUp2_Linear_NEON,

@@ -842,6 +864,14 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_NEON,
          uint8_t)
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_NEON,
          ScaleUVRowUp2_Linear_16_NEON,
          ScaleUVRowUp2_Linear_16_C,
          7,
          uint16_t)
#endif

#undef SBUH2LANY

// Scale bi-planar plane up 2 times using bilinear filter.

@@ -886,6 +916,12 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_C,
          0,
          uint8_t)

SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_C,
          ScaleUVRowUp2_Bilinear_16_C,
          ScaleUVRowUp2_Bilinear_16_C,
          0,
          uint16_t)

#ifdef HAS_SCALEUVROWUP2BILINEAR_SSSE3
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_SSSE3,
          ScaleUVRowUp2_Bilinear_SSSE3,

@@ -902,6 +938,22 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
          uint8_t)
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2,
          ScaleUVRowUp2_Bilinear_16_SSE2,
          ScaleUVRowUp2_Bilinear_16_C,
          7,
          uint16_t)
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_AVX2,
          ScaleUVRowUp2_Bilinear_16_AVX2,
          ScaleUVRowUp2_Bilinear_16_C,
          7,
          uint16_t)
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
          ScaleUVRowUp2_Bilinear_NEON,

@@ -910,6 +962,14 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_NEON,
          uint8_t)
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_NEON,
          ScaleUVRowUp2_Bilinear_16_NEON,
          ScaleUVRowUp2_Bilinear_16_C,
          3,
          uint16_t)
#endif

#undef SBU2BLANY

#ifdef __cplusplus

@@ -1258,6 +1258,64 @@ void ScaleUVRowUp2_Bilinear_C(const uint8_t* src_ptr,
  }
}

void ScaleUVRowUp2_Linear_16_C(const uint16_t* src_ptr,
                               uint16_t* dst_ptr,
                               int dst_width) {
  int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    dst_ptr[4 * x + 0] =
        (src_ptr[2 * x + 0] * 3 + src_ptr[2 * x + 2] * 1 + 2) >> 2;
    dst_ptr[4 * x + 1] =
        (src_ptr[2 * x + 1] * 3 + src_ptr[2 * x + 3] * 1 + 2) >> 2;
    dst_ptr[4 * x + 2] =
        (src_ptr[2 * x + 0] * 1 + src_ptr[2 * x + 2] * 3 + 2) >> 2;
    dst_ptr[4 * x + 3] =
        (src_ptr[2 * x + 1] * 1 + src_ptr[2 * x + 3] * 3 + 2) >> 2;
  }
}

void ScaleUVRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
                                 ptrdiff_t src_stride,
                                 uint16_t* dst_ptr,
                                 ptrdiff_t dst_stride,
                                 int dst_width) {
  const uint16_t* s = src_ptr;
  const uint16_t* t = src_ptr + src_stride;
  uint16_t* d = dst_ptr;
  uint16_t* e = dst_ptr + dst_stride;
  int src_width = dst_width >> 1;
  int x;
  assert((dst_width % 2 == 0) && (dst_width >= 0));
  for (x = 0; x < src_width; ++x) {
    d[4 * x + 0] = (s[2 * x + 0] * 9 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
                    t[2 * x + 2] * 1 + 8) >>
                   4;
    d[4 * x + 1] = (s[2 * x + 1] * 9 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
                    t[2 * x + 3] * 1 + 8) >>
                   4;
    d[4 * x + 2] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 9 + t[2 * x + 0] * 1 +
                    t[2 * x + 2] * 3 + 8) >>
                   4;
    d[4 * x + 3] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 9 + t[2 * x + 1] * 1 +
                    t[2 * x + 3] * 3 + 8) >>
                   4;
    e[4 * x + 0] = (s[2 * x + 0] * 3 + s[2 * x + 2] * 1 + t[2 * x + 0] * 9 +
                    t[2 * x + 2] * 3 + 8) >>
                   4;
    e[4 * x + 1] = (s[2 * x + 1] * 3 + s[2 * x + 3] * 1 + t[2 * x + 1] * 9 +
                    t[2 * x + 3] * 3 + 8) >>
                   4;
    e[4 * x + 2] = (s[2 * x + 0] * 1 + s[2 * x + 2] * 3 + t[2 * x + 0] * 3 +
                    t[2 * x + 2] * 9 + 8) >>
                   4;
    e[4 * x + 3] = (s[2 * x + 1] * 1 + s[2 * x + 3] * 3 + t[2 * x + 1] * 3 +
                    t[2 * x + 3] * 9 + 8) >>
                   4;
  }
}
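The (9, 3, 3, 1)/16 weights in the bilinear C reference above are simply the tensor product of the 1-D linear kernel (3/4, 1/4) used at the half-texel phase of 2x upsampling. In LaTeX form:

    % 2x2 weight matrix = outer product of the per-axis 1-D kernels
    w = \begin{pmatrix} 3/4 \\ 1/4 \end{pmatrix}
        \begin{pmatrix} 3/4 & 1/4 \end{pmatrix}
      = \frac{1}{16} \begin{pmatrix} 9 & 3 \\ 3 & 1 \end{pmatrix}

The integer code evaluates each output as (9*near + 3*horiz + 3*vert + diag + 8) >> 4; the +8 here (and the +2 in the linear path) implement round-to-nearest, which the NEON vrshrn/rshrn rounding-shift instructions below perform in a single step.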

// Scales a single row of pixels using point sampling.
void ScaleUVCols_C(uint8_t* dst_uv,
                   const uint8_t* src_uv,

(File diff suppressed because it is too large.)

@@ -791,6 +791,102 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
  );
}

void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;
  asm volatile(
      "vmov.u16 d30, #3 \n"

      "1: \n"
      "vld1.16 {q0}, [%0]! \n"  // 00112233 (1u1v, 16)
      "vld1.16 {q1}, [%3]! \n"  // 11223344 (1u1v, 16)

      "vmovl.u16 q2, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q3, d2 \n"  // 1122 (1u1v, 32b)
      "vmovl.u16 q4, d1 \n"  // 2233 (1u1v, 32b)
      "vmovl.u16 q5, d3 \n"  // 3344 (1u1v, 32b)
      "vmlal.u16 q2, d2, d30 \n"  // 3*near+far (odd)
      "vmlal.u16 q3, d0, d30 \n"  // 3*near+far (even)
      "vmlal.u16 q4, d3, d30 \n"  // 3*near+far (odd)
      "vmlal.u16 q5, d1, d30 \n"  // 3*near+far (even)

      "vrshrn.u32 d1, q2, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32 d0, q3, #2 \n"  // 3/4*near+1/4*far (even)
      "vrshrn.u32 d3, q4, #2 \n"  // 3/4*near+1/4*far (odd)
      "vrshrn.u32 d2, q5, #2 \n"  // 3/4*near+1/4*far (even)

      "vst2.32 {d0, d1}, [%1]! \n"  // store
      "vst2.32 {d2, d3}, [%1]! \n"  // store
      "subs %2, %2, #8 \n"  // 4 uv -> 8 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(dst_ptr),    // %1
        "+r"(dst_width),  // %2
        "+r"(src_temp)    // %3
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "d30"  // Clobber List
  );
}

void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 2;
  const uint16_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
      "vmov.u16 d30, #3 \n"
      "vmov.u32 q14, #3 \n"

      "1: \n"
      "vld1.8 {d0}, [%0]! \n"  // 0011 (1u1v)
      "vld1.8 {d1}, [%5]! \n"  // 1122 (1u1v)
      "vmovl.u16 q2, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q3, d1 \n"  // 1122 (1u1v, 32b)
      "vmlal.u16 q2, d1, d30 \n"  // 3*near+far (1, odd)
      "vmlal.u16 q3, d0, d30 \n"  // 3*near+far (1, even)

      "vld1.8 {d0}, [%1]! \n"  // 0011 (1u1v)
      "vld1.8 {d1}, [%6]! \n"  // 1122 (1u1v)
      "vmovl.u16 q4, d0 \n"  // 0011 (1u1v, 32b)
      "vmovl.u16 q5, d1 \n"  // 1122 (1u1v, 32b)
      "vmlal.u16 q4, d1, d30 \n"  // 3*near+far (2, odd)
      "vmlal.u16 q5, d0, d30 \n"  // 3*near+far (2, even)

      "vmovq q0, q4 \n"
      "vmovq q1, q5 \n"
      "vmla.u32 q4, q2, q14 \n"  // 9 3 3 1 (1, odd)
      "vmla.u32 q5, q3, q14 \n"  // 9 3 3 1 (1, even)
      "vmla.u32 q2, q0, q14 \n"  // 9 3 3 1 (2, odd)
      "vmla.u32 q3, q1, q14 \n"  // 9 3 3 1 (2, even)

      "vrshrn.u32 d1, q4, #4 \n"  // 1, odd
      "vrshrn.u32 d0, q5, #4 \n"  // 1, even
      "vrshrn.u32 d3, q2, #4 \n"  // 2, odd
      "vrshrn.u32 d2, q3, #4 \n"  // 2, even

      "vst2.32 {d0, d1}, [%2]! \n"  // store
      "vst2.32 {d2, d3}, [%3]! \n"  // store
      "subs %4, %4, #4 \n"  // 2 uv -> 4 uv
      "bgt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(dst_ptr),    // %2
        "+r"(dst_ptr1),   // %3
        "+r"(dst_width),  // %4
        "+r"(src_temp),   // %5
        "+r"(src_temp1)   // %6
      :
      : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q14",
        "d30"  // Clobber List
  );
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

@@ -799,8 +799,8 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
      "rshrn v4.8b, v4.8h, #4 \n"  // 1, odd
      "rshrn v3.8b, v5.8h, #4 \n"  // 1, even

      "st2 {v1.4h, v2.4h}, [%5], #16 \n"  // store 1
      "st2 {v3.4h, v4.4h}, [%4], #16 \n"  // store 2
      "st2 {v1.4h, v2.4h}, [%5], #16 \n"  // store 2
      "st2 {v3.4h, v4.4h}, [%4], #16 \n"  // store 1
      "subs %w6, %w6, #8 \n"  // 4 uv -> 8 uv
      "b.gt 1b \n"
      : "+r"(src_ptr),  // %0

@@ -816,6 +816,106 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
  );
}

void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
                                  uint16_t* dst_ptr,
                                  int dst_width) {
  const uint16_t* src_temp = src_ptr + 2;
  asm volatile(
      "movi v31.8h, #3 \n"

      "1: \n"
      "ld1 {v0.8h}, [%0], #16 \n"  // 01234567 (16b)
      "ld1 {v1.8h}, [%1], #16 \n"  // 12345678 (16b)
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead

      "ushll v2.4s, v0.4h, #0 \n"  // 0011 (1u1v, 32b)
      "ushll v3.4s, v1.4h, #0 \n"  // 1122 (1u1v, 32b)
      "ushll2 v4.4s, v0.8h, #0 \n"  // 2233 (1u1v, 32b)
      "ushll2 v5.4s, v1.8h, #0 \n"  // 3344 (1u1v, 32b)

      "umlal v2.4s, v1.4h, v31.4h \n"  // 3*near+far (odd)
      "umlal v3.4s, v0.4h, v31.4h \n"  // 3*near+far (even)
      "umlal2 v4.4s, v1.8h, v31.8h \n"  // 3*near+far (odd)
      "umlal2 v5.4s, v0.8h, v31.8h \n"  // 3*near+far (even)

      "rshrn v2.4h, v2.4s, #2 \n"  // 3/4*near+1/4*far (odd)
      "rshrn v1.4h, v3.4s, #2 \n"  // 3/4*near+1/4*far (even)
      "rshrn v4.4h, v4.4s, #2 \n"  // 3/4*near+1/4*far (odd)
      "rshrn v3.4h, v5.4s, #2 \n"  // 3/4*near+1/4*far (even)

      "st2 {v1.2s, v2.2s}, [%2], #16 \n"  // store
      "st2 {v3.2s, v4.2s}, [%2], #16 \n"  // store
      "subs %w3, %w3, #8 \n"  // 4 uv -> 8 uv
      "b.gt 1b \n"
      : "+r"(src_ptr),   // %0
        "+r"(src_temp),  // %1
        "+r"(dst_ptr),   // %2
        "+r"(dst_width)  // %3
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v31"  // Clobber List
  );
}

void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
                                    ptrdiff_t src_stride,
                                    uint16_t* dst_ptr,
                                    ptrdiff_t dst_stride,
                                    int dst_width) {
  const uint16_t* src_ptr1 = src_ptr + src_stride;
  uint16_t* dst_ptr1 = dst_ptr + dst_stride;
  const uint16_t* src_temp = src_ptr + 2;
  const uint16_t* src_temp1 = src_ptr1 + 2;

  asm volatile(
      "movi v31.4h, #3 \n"
      "movi v30.4s, #3 \n"

      "1: \n"
      "ldr d0, [%0], #8 \n"
      "ldr d1, [%2], #8 \n"
      "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
      "ushll v2.4s, v0.4h, #0 \n"  // 0011 (1u1v, 32b)
      "ushll v3.4s, v1.4h, #0 \n"  // 1122 (1u1v, 32b)
      "umlal v2.4s, v1.4h, v31.4h \n"  // 3*near+far (1, odd)
      "umlal v3.4s, v0.4h, v31.4h \n"  // 3*near+far (1, even)

      "ldr d0, [%1], #8 \n"
      "ldr d1, [%3], #8 \n"
      "prfm pldl1keep, [%1, 448] \n"  // prefetch 7 lines ahead
      "ushll v4.4s, v0.4h, #0 \n"  // 0011 (1u1v, 32b)
      "ushll v5.4s, v1.4h, #0 \n"  // 1122 (1u1v, 32b)
      "umlal v4.4s, v1.4h, v31.4h \n"  // 3*near+far (2, odd)
      "umlal v5.4s, v0.4h, v31.4h \n"  // 3*near+far (2, even)

      "mov v0.4s, v4.4s \n"
      "mov v1.4s, v5.4s \n"
      "mla v4.4s, v2.4s, v30.4s \n"  // 9 3 3 1 (1, odd)
      "mla v5.4s, v3.4s, v30.4s \n"  // 9 3 3 1 (1, even)
      "mla v2.4s, v0.4s, v30.4s \n"  // 9 3 3 1 (2, odd)
      "mla v3.4s, v1.4s, v30.4s \n"  // 9 3 3 1 (2, even)

      "rshrn v1.4h, v2.4s, #4 \n"  // 2, odd
      "rshrn v0.4h, v3.4s, #4 \n"  // 2, even
      "rshrn v3.4h, v4.4s, #4 \n"  // 1, odd
      "rshrn v2.4h, v5.4s, #4 \n"  // 1, even

      "st2 {v0.2s, v1.2s}, [%5], #16 \n"  // store 2
      "st2 {v2.2s, v3.2s}, [%4], #16 \n"  // store 1
      "subs %w6, %w6, #4 \n"  // 2 uv -> 4 uv
      "b.gt 1b \n"
      : "+r"(src_ptr),    // %0
        "+r"(src_ptr1),   // %1
        "+r"(src_temp),   // %2
        "+r"(src_temp1),  // %3
        "+r"(dst_ptr),    // %4
        "+r"(dst_ptr1),   // %5
        "+r"(dst_width)   // %6
      :
      : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
        "v31"  // Clobber List
  );
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

@@ -741,23 +741,124 @@ void ScaleUVBilinearUp2(int src_width,
  }
#endif

  if (src_height == 1) {
    Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
  } else {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
    dst_ptr += dst_stride;
    for (x = 0; x < src_height - 1; ++x) {
      Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
      src_ptr += src_stride;
      // TODO: Test performance of writing one row of destination at a time.
      dst_ptr += 2 * dst_stride;
    }
    if (!(dst_height & 1)) {
      Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      dst_ptr += dst_stride;
      for (x = 0; x < src_height - 1; ++x) {
        Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
        src_ptr += src_stride;
        // TODO: Test performance of writing one row of destination at a time.
        dst_ptr += 2 * dst_stride;
      }
      if (!(dst_height & 1)) {
        Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
      }
    }

// Scale 16-bit UV, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times its
// original width, using linear interpolation.
// This is used to scale U and V planes of P210 to P410.
void ScaleUVLinearUp2_16(int src_width,
                         int src_height,
                         int dst_width,
                         int dst_height,
                         int src_stride,
                         int dst_stride,
                         const uint16_t* src_uv,
                         uint16_t* dst_uv) {
  void (*ScaleRowUp)(const uint16_t* src_uv, uint16_t* dst_uv, int dst_width) =
      ScaleUVRowUp2_Linear_16_Any_C;
  int i;
  int y;
  int dy;

  // This function can only scale up by 2 times horizontally.
  assert(src_width == ((dst_width + 1) / 2));

#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2LINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_NEON;
  }
#endif

  if (dst_height == 1) {
    ScaleRowUp(src_uv + ((src_height - 1) / 2) * src_stride, dst_uv, dst_width);
  } else {
    dy = FixedDiv(src_height - 1, dst_height - 1);
    y = (1 << 15) - 1;
    for (i = 0; i < dst_height; ++i) {
      ScaleRowUp(src_uv + (y >> 16) * src_stride, dst_uv, dst_width);
      dst_uv += dst_stride;
      y += dy;
    }
  }
}
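The vertical walk above uses 16.16 fixed point: dy = FixedDiv(src_height - 1, dst_height - 1) is the source step per destination row, and starting y at (1 << 15) - 1 (about 0.5) makes y >> 16 round to the nearest source row. A small standalone sketch of the same stepping (FixedDiv is assumed here to be (a << 16) / b, as elsewhere in libyuv):

    #include <stdio.h>

    int main(void) {
      int src_height = 4, dst_height = 8;
      int dy = ((src_height - 1) << 16) / (dst_height - 1);  // 16.16 step ~0.43
      int y = (1 << 15) - 1;                                 // start near +0.5
      for (int i = 0; i < dst_height; ++i, y += dy)
        printf("dst row %d <- src row %d\n", i, y >> 16);  // nearest source row
      return 0;
    }

With these values each source row is used for two consecutive destination rows (0,0,1,1,2,2,3,3), i.e. nearest-neighbor vertically.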

// Scale 16-bit UV, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original size, using bilinear interpolation.
// This is used to scale U and V planes of P010 to P410.
void ScaleUVBilinearUp2_16(int src_width,
                           int src_height,
                           int dst_width,
                           int dst_height,
                           int src_stride,
                           int dst_stride,
                           const uint16_t* src_ptr,
                           uint16_t* dst_ptr) {
  void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
                      uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
      ScaleUVRowUp2_Bilinear_16_Any_C;
  int x;

  // This function can only scale up by 2 times.
  assert(src_width == ((dst_width + 1) / 2));
  assert(src_height == ((dst_height + 1) / 2));

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
  if (TestCpuFlag(kCpuHasSSE2)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_AVX2
  if (TestCpuFlag(kCpuHasAVX2)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_AVX2;
  }
#endif

#ifdef HAS_SCALEUVROWUP2BILINEAR_16_NEON
  if (TestCpuFlag(kCpuHasNEON)) {
    Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_NEON;
  }
#endif

  Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  dst_ptr += dst_stride;
  for (x = 0; x < src_height - 1; ++x) {
    Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
    src_ptr += src_stride;
    // TODO: Test performance of writing one row of destination at a time.
    dst_ptr += 2 * dst_stride;
  }
  if (!(dst_height & 1)) {
    Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
  }
}

// Scale UV to/from any dimensions, without interpolation.
// Fixed point math is used for performance: The upper 16 bits
// of x and dx is the integer part of the source position and

@@ -851,6 +952,26 @@ static int UVCopy(const uint8_t* src_UV,
  CopyPlane(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
  return 0;
}

static int UVCopy_16(const uint16_t* src_UV,
                     int src_stride_UV,
                     uint16_t* dst_UV,
                     int dst_stride_UV,
                     int width,
                     int height) {
  if (!src_UV || !dst_UV || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_UV = src_UV + (height - 1) * src_stride_UV;
    src_stride_UV = -src_stride_UV;
  }

  CopyPlane_16(src_UV, src_stride_UV, dst_UV, dst_stride_UV, width * 2, height);
  return 0;
}
#endif  // HAS_UVCOPY

// Scale a UV plane (from NV12)

@@ -953,7 +1074,7 @@ static void ScaleUV(const uint8_t* src,
                    dst_stride, src, dst, x, y, dy, 4, filtering);
    return;
  }
  if (filtering && src_height == dst_height) {
  if (filtering && (dst_width + 1) / 2 == src_width) {
    ScaleUVLinearUp2(src_width, src_height, clip_width, clip_height, src_stride,
                     dst_stride, src, dst);
    return;

@@ -1005,6 +1126,69 @@ int UVScale(const uint8_t* src_uv,
  return 0;
}

// Scale a 16-bit UV image.
// This function is currently incomplete; it can't handle all cases.
LIBYUV_API
int UVScale_16(const uint16_t* src_uv,
               int src_stride_uv,
               int src_width,
               int src_height,
               uint16_t* dst_uv,
               int dst_stride_uv,
               int dst_width,
               int dst_height,
               enum FilterMode filtering) {
  int dy = 0;

  if (!src_uv || src_width == 0 || src_height == 0 || src_width > 32768 ||
      src_height > 32768 || !dst_uv || dst_width <= 0 || dst_height <= 0) {
    return -1;
  }

  // UV does not support box filter yet, but allow the user to pass it.
  // Simplify filtering when possible.
  filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
                                filtering);

  // Negative src_height means invert the image.
  if (src_height < 0) {
    src_height = -src_height;
    src_uv = src_uv + (src_height - 1) * src_stride_uv;
    src_stride_uv = -src_stride_uv;
  }
  src_width = Abs(src_width);

#ifdef HAS_UVCOPY
  if (!filtering && src_width == dst_width && (src_height % dst_height == 0)) {
    if (dst_height == 1) {
      UVCopy_16(src_uv + ((src_height - 1) / 2) * src_stride_uv, src_stride_uv,
                dst_uv, dst_stride_uv, dst_width, dst_height);
    } else {
      dy = src_height / dst_height;
      UVCopy_16(src_uv + src_stride_uv * ((dy - 1) / 2), src_stride_uv * dy,
                dst_uv, dst_stride_uv, dst_width, dst_height);
    }

    return 0;
  }
#endif

  if (filtering && (dst_width + 1) / 2 == src_width) {
    ScaleUVLinearUp2_16(src_width, src_height, dst_width, dst_height,
                        src_stride_uv, dst_stride_uv, src_uv, dst_uv);
    return 0;
  }

  if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
      (filtering == kFilterBilinear || filtering == kFilterBox)) {
    ScaleUVBilinearUp2_16(src_width, src_height, dst_width, dst_height,
                          src_stride_uv, dst_stride_uv, src_uv, dst_uv);
    return 0;
  }

  return -1;
}
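Note the contrast with the 8-bit UVScale above, which handles arbitrary factors: UVScale_16 only implements the copy, 2x-horizontal, and 2x-both fast paths needed by P010/P210 conversion and returns -1 otherwise, matching the "incomplete" comment in the header. A hypothetical caller outside those paths should therefore check the result:

    // Sketch: detect when the 16-bit UV scaler has no path for this factor.
    if (UVScale_16(src_uv, src_stride_uv, src_width, src_height,
                   dst_uv, dst_stride_uv, dst_width, dst_height,
                   kFilterBilinear) != 0) {
      // Handle the unsupported scale factor here (no generic fallback exists yet).
    }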

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv

@@ -377,89 +377,119 @@ TESTPLANARTOBP(I444, 1, 1, NV12, 2, 2)
TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)

#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                          FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
                          OFF, DOY) \
  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
    const int kHeight = benchmark_height_; \
    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
    align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
                                          SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
                                      OFF); \
    align_buffer_page_end(dst_y_c, kWidth* kHeight); \
    align_buffer_page_end(dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
                                        SUBSAMPLE(kHeight, SUBSAMP_Y)); \
    align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
    align_buffer_page_end(dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X) * \
                                          SUBSAMPLE(kHeight, SUBSAMP_Y)); \
    for (int i = 0; i < kHeight; ++i) \
      for (int j = 0; j < kWidth; ++j) \
        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
    for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
      for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
        src_uv[(i * 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
            (fastrand() & 0xff); \
      } \
    } \
    memset(dst_y_c, 1, kWidth* kHeight); \
    memset(dst_uv_c, 2, \
           2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
    memset(dst_y_opt, 101, kWidth* kHeight); \
    memset(dst_uv_opt, 102, \
           2 * SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
    MaskCpuFlags(disable_cpu_flags_); \
    SRC_FMT_PLANAR##To##FMT_PLANAR( \
        src_y + OFF, kWidth, src_uv + OFF, \
        2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_c : NULL, kWidth, \
        dst_uv_c, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      SRC_FMT_PLANAR##To##FMT_PLANAR( \
          src_y + OFF, kWidth, src_uv + OFF, \
          2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X), DOY ? dst_y_opt : NULL, \
          kWidth, dst_uv_opt, 2 * SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, \
          NEG kHeight); \
    } \
    if (DOY) { \
      for (int i = 0; i < kHeight; ++i) { \
        for (int j = 0; j < kWidth; ++j) { \
          EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
        } \
      } \
    } \
    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
      for (int j = 0; j < 2 * SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
        EXPECT_EQ(dst_uv_c[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j], \
                  dst_uv_opt[i * 2 * SUBSAMPLE(kWidth, SUBSAMP_X) + j]); \
      } \
    } \
    free_aligned_buffer_page_end(dst_y_c); \
    free_aligned_buffer_page_end(dst_uv_c); \
    free_aligned_buffer_page_end(dst_y_opt); \
    free_aligned_buffer_page_end(dst_uv_opt); \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_uv); \
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                          SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
                          DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
                          DOY, SRC_DEPTH) \
  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
    static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
    static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
    static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \
                  "DST SRC_SUBSAMP_X unsupported"); \
    static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \
                  "DST SRC_SUBSAMP_Y unsupported"); \
    static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \
                  "DST DST_SUBSAMP_X unsupported"); \
    static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \
                  "DST DST_SUBSAMP_Y unsupported"); \
    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
    const int kHeight = benchmark_height_; \
    const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
    const int kSrcHalfHeight = SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); \
    const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
    const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
    align_buffer_page_end(src_y, kWidth* kHeight* SRC_BPC + OFF); \
    align_buffer_page_end(src_uv, \
                          2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC + OFF); \
    align_buffer_page_end(dst_y_c, kWidth* kHeight* DST_BPC); \
    align_buffer_page_end(dst_uv_c, \
                          2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
    align_buffer_page_end(dst_y_opt, kWidth* kHeight* DST_BPC); \
    align_buffer_page_end(dst_uv_opt, \
                          2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
    MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
    MemRandomize(src_uv + OFF, 2 * kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
    SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
    SRC_T* src_uv_p = reinterpret_cast<SRC_T*>(src_uv + OFF); \
    for (int i = 0; i < kWidth * kHeight; ++i) { \
      src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
    } \
    for (int i = 0; i < 2 * kSrcHalfWidth * kSrcHalfHeight; ++i) { \
      src_uv_p[i] = src_uv_p[i] & ((1 << SRC_DEPTH) - 1); \
    } \
    memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
    memset(dst_uv_c, 2, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
    memset(dst_y_opt, 101, kWidth* kHeight* DST_BPC); \
    memset(dst_uv_opt, 102, 2 * kDstHalfWidth * kDstHalfHeight * DST_BPC); \
    MaskCpuFlags(disable_cpu_flags_); \
    SRC_FMT_PLANAR##To##FMT_PLANAR( \
        src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
        DOY ? reinterpret_cast<DST_T*>(dst_y_c) : NULL, kWidth, \
        reinterpret_cast<DST_T*>(dst_uv_c), 2 * kDstHalfWidth, \
        kWidth, NEG kHeight); \
    MaskCpuFlags(benchmark_cpu_info_); \
    for (int i = 0; i < benchmark_iterations_; ++i) { \
      SRC_FMT_PLANAR##To##FMT_PLANAR( \
          src_y_p, kWidth, src_uv_p, 2 * kSrcHalfWidth, \
          DOY ? reinterpret_cast<DST_T*>(dst_y_opt) : NULL, kWidth, \
          reinterpret_cast<DST_T*>(dst_uv_opt), 2 * kDstHalfWidth, \
          kWidth, NEG kHeight); \
    } \
    if (DOY) { \
      for (int i = 0; i < kHeight; ++i) { \
        for (int j = 0; j < kWidth; ++j) { \
          EXPECT_EQ(dst_y_c[i * kWidth + j], dst_y_opt[i * kWidth + j]); \
        } \
      } \
    } \
    for (int i = 0; i < kDstHalfHeight; ++i) { \
      for (int j = 0; j < 2 * kDstHalfWidth; ++j) { \
        EXPECT_EQ(dst_uv_c[i * 2 * kDstHalfWidth + j], \
                  dst_uv_opt[i * 2 * kDstHalfWidth + j]); \
      } \
    } \
    free_aligned_buffer_page_end(dst_y_c); \
    free_aligned_buffer_page_end(dst_uv_c); \
    free_aligned_buffer_page_end(dst_y_opt); \
    free_aligned_buffer_page_end(dst_uv_opt); \
    free_aligned_buffer_page_end(src_y); \
    free_aligned_buffer_page_end(src_uv); \
  }

#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, \
                    1) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
                    SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                         SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
                         DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                    DST_SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0, 1, \
                    SRC_DEPTH) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                    DST_SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1, 1, \
                    SRC_DEPTH) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                    DST_SUBSAMP_Y, benchmark_width_, _Invert, -, 0, 1, \
                    SRC_DEPTH) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                    DST_SUBSAMP_Y, benchmark_width_, _Opt, +, 0, 1, SRC_DEPTH) \
  TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
                    SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, \
                    DST_SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0, \
                    SRC_DEPTH)

TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV24, 1, 1)
TESTBIPLANARTOBP(NV16, 2, 1, NV24, 1, 1)
TESTBIPLANARTOBP(NV21, uint8_t, 1, 2, 2, NV12, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV12Mirror, uint8_t, 1, 2, 2, 8)
TESTBIPLANARTOBP(NV12, uint8_t, 1, 2, 2, NV24, uint8_t, 1, 1, 1, 8)
TESTBIPLANARTOBP(NV16, uint8_t, 1, 2, 1, NV24, uint8_t, 1, 1, 1, 8)
// These formats put data at high bits, so test on full 16bit range.
TESTBIPLANARTOBP(P010, uint16_t, 2, 2, 2, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P210, uint16_t, 2, 2, 1, P410, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P012, uint16_t, 2, 2, 2, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P212, uint16_t, 2, 2, 1, P412, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P016, uint16_t, 2, 2, 2, P416, uint16_t, 2, 1, 1, 16)
TESTBIPLANARTOBP(P216, uint16_t, 2, 2, 1, P416, uint16_t, 2, 1, 1, 16)

#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
                         FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \