Add special optimization for I420ToI444 and I422ToI444

These functions use a (bi)linear filter to scale the U and V planes up to the size of the Y plane.
This helps improve the quality of YUV to RGB conversion.

Also added 10-bit and 12-bit versions:
I010ToI410
I210ToI410
I012ToI412
I212ToI412

libyuv_unittest --gtest_filter=LibYUVConvertTest.I42*ToI444*:LibYUVConvertTest.I*1*ToI41*
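
A minimal usage sketch (illustrative only, not part of this change; the wrapper name
and tightly packed strides are assumptions) for one of the new 10-bit entry points
declared in libyuv/convert.h. For the 16-bit APIs, strides are in counts of uint16_t,
so a tightly packed plane uses a stride equal to its width:

// Hypothetical caller: converts an I010 frame to I410 (4:4:4, 10-bit),
// upsampling the half-resolution chroma planes with the bilinear path.
#include "libyuv/convert.h"

int UpsampleI010ToI410(const uint16_t* src_y, int src_stride_y,
                       const uint16_t* src_u, int src_stride_u,
                       const uint16_t* src_v, int src_stride_v,
                       uint16_t* dst_y, uint16_t* dst_u, uint16_t* dst_v,
                       int width, int height) {
  // All three I410 destination planes are width x height.
  return I010ToI410(src_y, src_stride_y, src_u, src_stride_u, src_v,
                    src_stride_v, dst_y, width, dst_u, width, dst_v, width,
                    width, height);
}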

R=fbarchard@chromium.org

Change-Id: Ie4a711a5ba28f2ff1f44c021f7a5c149022264c5
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2658097
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Yuan Tong 2021-02-03 14:21:07 +08:00 committed by Frank Barchard
parent c28d404936
commit fc61dde1eb
13 changed files with 2155 additions and 30 deletions

View File

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1772
Version: 1774
License: BSD
License File: LICENSE

View File

@@ -89,6 +89,23 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);
// Convert I422 to I444.
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I422 to NV21.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
@@ -122,6 +139,23 @@ int I420Copy(const uint8_t* src_y,
int width,
int height);
// Convert I420 to I444.
LIBYUV_API
int I420ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Copy I010 to I010
#define I010ToI010 I010Copy
#define H010ToH010 I010Copy
@@ -159,6 +193,46 @@ int I010ToI420(const uint16_t* src_y,
int width,
int height);
// Convert I010 to I410
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I012 to I412
#define I012ToI412 I010ToI410
// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I212 to I412
#define I212ToI412 I210ToI410
// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8_t* src_y,

View File

@@ -49,6 +49,18 @@ void ScalePlane_16(const uint16_t* src,
int dst_height,
enum FilterMode filtering);
// Sample is expected to be in the low 12 bits.
LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);
// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is

View File

@@ -77,6 +77,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALECOLUP2LINEAR_SSE2
#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALECOLUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#endif
// The following are available for gcc/clang x86 platforms, but
@@ -86,6 +92,10 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALECOLUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#endif
// The following are available on all x86 platforms, but
@@ -114,6 +124,10 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALECOLUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#endif
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -279,6 +293,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,
@@ -508,6 +556,88 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
@@ -1143,6 +1273,39 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1773
#define LIBYUV_VERSION 1774
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -159,6 +159,102 @@ int I420ToI444(const uint8_t* src_y,
dst_uv_height);
}
// 420 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}
// 422 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// 422 chroma is 1/2 width, 1x height
// 444 chroma is 1x width, 1x height
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}
if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}
// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8_t* src_y,

View File

@@ -1336,6 +1336,238 @@ void ScalePlaneBilinearUp(int src_width,
}
}
// Scale plane, horizontally up by 2 times, vertically by any ratio.
// Uses a linear filter horizontally and nearest-neighbor vertically.
// This is an optimized version for scaling a plane up to 2 times its
// original width, using linear interpolation.
// This is used to scale the U and V planes of I422 to I444.
void ScalePlaneUp2_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
ScaleRowUp2_Linear_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
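// y starts just below 0.5 in 16.16 fixed point, so (y >> 16) selects the
// nearest source row for each destination row.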
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}
// Scale plane, up by 2 times in both dimensions.
// This is an optimized version for scaling a plane up to 2 times its
// original size, using bilinear interpolation.
// This is used to scale the U and V planes of I420 to I444.
void ScalePlaneUp2_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO test performance of writing one row of destination at a time
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
// Scale an at most 14-bit plane, horizontally up by 2 times.
// This is an optimized version for scaling a plane up to 2 times its
// original width, using linear interpolation.
// Strides are in counts of uint16_t.
// This is used to scale the U and V planes of I210 to I410 and I212 to I412.
void ScalePlaneUp2_16_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
int dst_width) = ScaleRowUp2_Linear_16_Any_C;
int i;
int y;
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
#endif
if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}
// Scale an at most 12-bit plane, up by 2 times in both dimensions.
// This is an optimized version for scaling a plane up to 2 times its
// original size, using bilinear interpolation.
// Strides are in counts of uint16_t.
// This is used to scale the U and V planes of I010 to I410 and I012 to I412.
void ScalePlaneUp2_16_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_16_Any_C;
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}
#endif
if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}
void ScalePlaneBilinearUp_16(int src_width,
int src_height,
int dst_width,
@@ -1627,6 +1859,17 @@ void ScalePlane(const uint8_t* src,
dst_stride, src, dst);
return;
}
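// Fast path: the destination width is exactly 2x (or 2x - 1) the source
// width and only a horizontal linear filter is requested.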
if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);
@@ -1724,6 +1967,43 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
}
LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);
// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}
if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
dst_width, dst_height, filtering);
}
// Scale an I420 image.
// This function in turn calls a scaling function for each plane.

View File

@@ -609,6 +609,191 @@ CANY(ScaleARGBFilterCols_Any_MSA,
#endif
#undef CANY
// Scale up horizontally 2 times using linear filter.
#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
dst_ptr[0] = src_ptr[0]; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(src_ptr, dst_ptr + 1, n); \
} \
C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
} \
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
// Even the C version needs to be wrapped, because the boundary pixels have to
// be handled differently.
SUH2LANY(ScaleRowUp2_Linear_Any_C,
ScaleRowUp2_Linear_C,
ScaleRowUp2_Linear_C,
0,
uint8_t)
SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
ScaleRowUp2_Linear_16_C,
ScaleRowUp2_Linear_16_C,
0,
uint16_t)
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
31,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif
#undef SUH2LANY
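(Side note, not part of the diff: in the wrapper above, the first and last
destination samples coincide with source samples and are copied directly, while
the 3:1 kernel fills only the interior. A hypothetical standalone check of that
edge handling for dst_width = 8, assuming nothing beyond the math above:)

#include <assert.h>
#include <stdint.h>

static void up2_linear_ref(const uint8_t* src, uint8_t* dst, int dst_width) {
  int work_width = (dst_width - 1) & ~1;          // interior samples: 6 of 8
  dst[0] = src[0];                                // left edge, copied
  for (int x = 0; x < work_width / 2; ++x) {      // 3:1 linear pairs
    dst[2 * x + 1] = (uint8_t)((src[x] * 3 + src[x + 1] + 2) >> 2);
    dst[2 * x + 2] = (uint8_t)((src[x] + src[x + 1] * 3 + 2) >> 2);
  }
  dst[dst_width - 1] = src[(dst_width / 2) - 1];  // right edge, copied
}

int main(void) {
  const uint8_t src[4] = {0, 40, 80, 120};
  uint8_t dst[8];
  up2_linear_ref(src, dst, 8);
  assert(dst[0] == 0 && dst[7] == 120);  // edges are source samples
  assert(dst[1] == 10 && dst[2] == 30);  // (0*3+40+2)>>2 and (0+40*3+2)>>2
  return 0;
}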
// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
const PTYPE* sa = src_ptr; \
const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0]) >> 2; \
db[0] = (sa[0] + 3 * sb[0]) >> 2; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(sa, sb - sa, da + 1, db - da, n); \
} \
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
} \
da[dst_width - 1] = \
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \
db[dst_width - 1] = \
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \
}
SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
ScaleRowUp2_Bilinear_C,
ScaleRowUp2_Bilinear_C,
0,
uint8_t)
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
ScaleRowUp2_Bilinear_16_C,
ScaleRowUp2_Bilinear_16_C,
0,
uint16_t)
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
31,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif
#undef SU2BLANY
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
// ^ src_ptr at here
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}
// sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
// O O O O O
// X X X X X X X X X X
// ^ dst_ptr at here ^ stop at here
// X X X X X X X X X X
// O O O O O
// X X X X X X X X X X
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint8_t* d = dst_ptr;
uint8_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}
// Only suitable for at most 14-bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}
// Only suitable for at most 12-bit range.
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* s = src_ptr;
const uint16_t* t = src_ptr + src_stride;
uint16_t* d = dst_ptr;
uint16_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}
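(Side note, not part of the diff: the 9/3/3/1 weights used above are the outer
product of the 1D 3:1 linear weights, evaluated at the quarter-pixel destination
positions shown in the diagrams. A small hypothetical check of that identity:)

#include <assert.h>
#include <stdint.h>

int main(void) {
  uint16_t s0 = 40, s1 = 80, t0 = 120, t1 = 200;  // arbitrary 2x2 neighborhood
  // Two separable 3:1 blends kept at full precision (each scaled by 4)...
  int h_near = 3 * s0 + 1 * s1;  // horizontal blend on the nearer row
  int h_far = 3 * t0 + 1 * t1;   // horizontal blend on the farther row
  // ...followed by a vertical 3:1 blend with a single rounding step...
  int two_pass = (3 * h_near + 1 * h_far + 8) >> 4;
  // ...equals the one-pass 9/3/3/1 expression used by the row functions.
  int one_pass = (9 * s0 + 3 * s1 + 3 * t0 + 1 * t1 + 8) >> 4;
  assert(two_pass == one_pass);
  return 0;
}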
// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,

View File

@@ -785,6 +785,836 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"xmm7");
}
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
LABELALIGN
"1: \n"
"movq (%0),%%xmm1 \n" // 01234567
"movq 1(%0),%%xmm2 \n" // 12345678
"movdqa %%xmm1,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm1,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm3,%%xmm5 \n"
"paddw %%xmm6,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm6,%%xmm1 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
"movq (%0),%%xmm1 \n" // 01234567
"movq 1(%0),%%xmm2 \n" // 12345678
"movdqa %%xmm1,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm1,%%xmm4 \n"
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n" // near+far
"movdqa %%xmm3,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n" // 2*near
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm3,%%xmm3 \n" // 2*near
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
// below line
"movq (%0,%3),%%xmm6 \n" // 01234567
"movq 1(%0,%3),%%xmm2 \n" // 12345678
"movdqa %%xmm6,%%xmm3 \n"
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
"punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
"movdqa %%xmm6,%%xmm5 \n"
"punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm7 \n"
"punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
"paddw %%xmm7,%%xmm5 \n" // near+far
"movdqa %%xmm3,%%xmm7 \n"
"punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
"paddw %%xmm7,%%xmm7 \n" // 2*near
"paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
"punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm6,%%xmm2 \n" // near+far
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
"paddw %%xmm3,%%xmm3 \n" // 2*near
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
// xmm4 xmm1
// xmm5 xmm2
"pcmpeqw %%xmm0,%%xmm0 \n"
"psrlw $15,%%xmm0 \n"
"psllw $3,%%xmm0 \n" // all 8
"movdqa %%xmm4,%%xmm3 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqa %%xmm1,%%xmm7 \n"
"movdqa %%xmm2,%%xmm6 \n"
"psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi)
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
"paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm7 \n" // ^ div by 16
"packuswb %%xmm7,%%xmm3 \n"
"movdqu %%xmm3,(%1) \n" // save above line
"movdqa %%xmm5,%%xmm3 \n"
"paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16
"movdqa %%xmm2,%%xmm3 \n"
"paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
"psrlw $4,%%xmm2 \n" // ^ div by 16
"packuswb %%xmm2,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // save below line
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $1,%%xmm6 \n" // all 2
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm1,%%xmm4 \n"
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm6,%%xmm4 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"psllw $1,%%xmm5 \n"
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
"movdqu %%xmm5,(%1) \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"paddw %%xmm6,%%xmm1 \n"
"psllw $1,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
"movdqu %%xmm1,0x10(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
:
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
LABELALIGN
"1: \n"
// above line
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm1,%%xmm4 \n"
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
"movdqa %%xmm1,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm2,%%xmm1 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
// below line
"movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16)
"movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16)
"movdqa %%xmm6,%%xmm3 \n"
"punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16)
"movdqa %%xmm2,%%xmm5 \n"
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
"paddw %%xmm5,%%xmm3 \n"
"movdqa %%xmm6,%%xmm5 \n"
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
"paddw %%xmm5,%%xmm5 \n"
"paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo)
"movdqa %%xmm6,%%xmm3 \n"
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
"punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16)
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
"paddw %%xmm6,%%xmm2 \n"
"paddw %%xmm3,%%xmm3 \n"
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
// xmm4 xmm1
// xmm5 xmm2
"movdqa %%xmm4,%%xmm3 \n"
"movdqa %%xmm5,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqu %%xmm3,(%1) \n"
"movdqa %%xmm1,%%xmm3 \n"
"movdqa %%xmm2,%%xmm6 \n"
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi)
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi)
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm3 \n" // ^ div by 16
"movdqu %%xmm3,0x10(%1) \n"
"movdqa %%xmm5,%%xmm3 \n"
"paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16
"movdqu %%xmm5,(%1,%4,2) \n"
"movdqa %%xmm2,%%xmm3 \n"
"paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi)
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm2 \n" // ^ div by 16
"movdqu %%xmm2,0x10(%1,%4,2) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)) // %4
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_SSSE3) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_SSSE3) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
// This version can handle the full 16-bit range but is slower.
void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
// This version can handle the full 16-bit range but is slower.
void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -946,8 +1776,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");

View File

@@ -504,6 +504,200 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
}
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%3]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
"vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
);
}
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%5]! \n" // 12345678
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)
"vld1.8 {d4}, [%1]! \n" // 01234567
"vld1.8 {d6}, [%6]! \n" // 12345678
"vmovl.u8 q2, d4 \n" // 01234567 (16b)
"vmovl.u8 q3, d6 \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)
// e o
// q1 q0
// q3 q2
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
// e o
// q5 q4
// q1 q0
"vrshrn.u16 d2, q1, #4 \n" // 2, even
"vrshrn.u16 d3, q0, #4 \n" // 2, odd
"vrshrn.u16 d0, q5, #4 \n" // 1, even
"vrshrn.u16 d1, q4, #4 \n" // 1, odd
"vst2.8 {d0, d1}, [%2]! \n" // store
"vst2.8 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q0}, [%3]! \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
"vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (odd)
"vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (even)
"vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
);
}
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
"add %5, %0, #2 \n"
"vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q1}, [%5]! \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (even)
"add %5, %1, #2 \n"
"vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
"vld1.16 {q3}, [%6]! \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (even)
"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)
"vrshr.u16 q2, q1, #4 \n" // 2, even
"vrshr.u16 q3, q0, #4 \n" // 2, odd
"vrshr.u16 q0, q5, #4 \n" // 1, even
"vrshr.u16 q1, q4, #4 \n" // 1, odd
"vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
"vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -535,6 +535,196 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%1], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)
"rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
);
}
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%2], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)
"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
"umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)
"rshrn v2.8b, v2.8h, #4 \n" // 2, odd
"rshrn v1.8b, v3.8h, #4 \n" // 2, even
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even
"st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 1
"st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 2
"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v2.8h, v0.8h \n"
"mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
"mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)
"urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
"urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)
"st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
);
}
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
"ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mov v0.8h, v2.8h \n"
"mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
"mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)
"ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
"ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"mov v0.8h, v4.8h \n"
"mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
"mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)
"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)
"urshr v2.8h, v2.8h, #4 \n" // 2, odd
"urshr v1.8h, v3.8h, #4 \n" // 2, even
"urshr v4.8h, v4.8h, #4 \n" // 1, odd
"urshr v3.8h, v5.8h, #4 \n" // 1, even
"st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
"st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2
"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v31" // Clobber List
);
}
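// A note on headroom in the 16-bit kernels above: both passes accumulate in
// 16-bit lanes, so the worst case after the horizontal pass is 4 * max_sample
// and after the vertical pass 16 * max_sample, which is presumably why these
// paths serve 10- and 12-bit content but nothing deeper. The arithmetic:
static_assert(16 * ((1 << 12) - 1) <= 65535,
              "16x a 12-bit sample still fits a uint16 accumulator");
static_assert(16 * ((1 << 13) - 1) > 65535,
              "16x a 13-bit sample would overflow it");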
// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,


@ -49,7 +49,8 @@ namespace libyuv {
#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF) \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@ -81,6 +82,16 @@ namespace libyuv {
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
@ -89,9 +100,7 @@ namespace libyuv {
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
@ -99,9 +108,7 @@ namespace libyuv {
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
reinterpret_cast<SRC_T*>(src_y + OFF), kWidth, \
reinterpret_cast<SRC_T*>(src_u + OFF), kSrcHalfWidth, \
reinterpret_cast<SRC_T*>(src_v + OFF), kSrcHalfWidth, \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
@ -127,34 +134,39 @@ namespace libyuv {
#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y) \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_ - 4, _Any, +, 0) \
benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Unaligned, +, 1) \
benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Invert, -, 0) \
benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Opt, +, 0)
benchmark_width_, _Opt, +, 0, SRC_DEPTH)
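// As a concrete example of what the macro generates, the instantiation
//   TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
// below expands into four gtest cases that exercise I420ToI444:
//   LibYUVConvertTest.I420ToI444_Any        (width = benchmark_width_ - 4)
//   LibYUVConvertTest.I420ToI444_Unaligned  (OFF = 1 on the source buffers)
//   LibYUVConvertTest.I420ToI444_Invert     (inverted height via NEG = -)
//   LibYUVConvertTest.I420ToI444_Opt        (aligned, default width)
// which the --gtest_filter in the commit message picks up.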
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
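// For reference, a minimal call of one of the conversions exercised above,
// upsampling I010 chroma planes to full resolution I410. Buffer management and
// stride choices are illustrative assumptions; strides are in uint16_t units,
// as in the test macro above.
#include "libyuv/convert.h"

int UpsampleI010ToI410(const uint16_t* src_y, const uint16_t* src_u,
                       const uint16_t* src_v, uint16_t* dst_y, uint16_t* dst_u,
                       uint16_t* dst_v, int width, int height) {
  int half_width = (width + 1) / 2;
  // I010: Y full size, U/V subsampled 2x2, 10 bits in a 16-bit container.
  // I410: all three planes at full size.
  return libyuv::I010ToI410(src_y, width, src_u, half_width, src_v, half_width,
                            dst_y, width, dst_u, width, dst_v, width, width,
                            height);
}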
// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \