Add special optimization for I420ToI444 and I422ToI444
These functions use a (bi)linear filter to scale the U and V planes up to the size of the Y plane, which helps improve the quality of YUV to RGB conversion. Also added 10 bit and 12 bit versions:
I010ToI410
I210ToI410
I012ToI412
I212ToI412

libyuv_unittest --gtest_filter=LibYUVConvertTest.I42*ToI444*:LibYUVConvertTest.I*1*ToI41*

R=fbarchard@chromium.org

Change-Id: Ie4a711a5ba28f2ff1f44c021f7a5c149022264c5
Bug: libyuv:872
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2658097
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
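As a usage note (not part of the change itself), a minimal sketch of calling the new 8 bit path; the frame size, strides, and the helper name UpsampleToI444 are illustrative assumptions:

#include "libyuv/convert.h"  // I420ToI444

// Upscale the chroma of a 64x48 I420 frame to I444 with the new bilinear
// path. I420 U/V planes are width/2 x height/2; I444 planes are full size.
int UpsampleToI444(const uint8_t* src_y, const uint8_t* src_u,
                   const uint8_t* src_v, uint8_t* dst_y, uint8_t* dst_u,
                   uint8_t* dst_v) {
  const int width = 64;
  const int height = 48;
  return I420ToI444(src_y, width, src_u, width / 2, src_v, width / 2,
                    dst_y, width, dst_u, width, dst_v, width, width, height);
}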
This commit is contained in:
parent
c28d404936
commit
fc61dde1eb
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1772
+Version: 1774
 License: BSD
 License File: LICENSE

@@ -89,6 +89,23 @@ int I422ToI420(const uint8_t* src_y,
int width,
int height);

// Convert I422 to I444.
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);

// Convert I422 to NV21.
LIBYUV_API
int I422ToNV21(const uint8_t* src_y,
@@ -122,6 +139,23 @@ int I420Copy(const uint8_t* src_y,
int width,
int height);

// Convert I420 to I444.
LIBYUV_API
int I420ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);

// Copy I010 to I010
#define I010ToI010 I010Copy
#define H010ToH010 I010Copy
@@ -159,6 +193,46 @@ int I010ToI420(const uint16_t* src_y,
int width,
int height);

// Convert I010 to I410
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);

// Convert I012 to I412
#define I012ToI412 I010ToI410

// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height);

// Convert I212 to I412
#define I212ToI412 I210ToI410

// Convert I400 (grey) to I420.
LIBYUV_API
int I400ToI420(const uint8_t* src_y,

@@ -49,6 +49,18 @@ void ScalePlane_16(const uint16_t* src,
int dst_height,
enum FilterMode filtering);

// Sample is expected to be in the low 12 bits.
LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering);

// Scales a YUV 4:2:0 image from the src width and height to the
// dst width and height.
// If filtering is kFilterNone, a simple nearest-neighbor algorithm is

@@ -77,6 +77,12 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
#define HAS_SCALEUVROWDOWN2BOX_SSSE3
#define HAS_SCALECOLUP2LINEAR_SSE2
#define HAS_SCALECOLUP2LINEAR_SSSE3
#define HAS_SCALEROWUP2LINEAR_SSE2
#define HAS_SCALEROWUP2LINEAR_SSSE3
#define HAS_SCALECOLUP2LINEAR_16_SSE2
#define HAS_SCALEROWUP2LINEAR_16_SSE2
#endif

// The following are available for gcc/clang x86 platforms, but

@@ -86,6 +92,10 @@ extern "C" {
(defined(__x86_64__) || defined(__i386__)) && !defined(_MSC_VER) && \
(defined(CLANG_HAS_AVX2) || defined(GCC_HAS_AVX2))
#define HAS_SCALEUVROWDOWN2BOX_AVX2
#define HAS_SCALECOLUP2LINEAR_AVX2
#define HAS_SCALEROWUP2LINEAR_AVX2
#define HAS_SCALECOLUP2LINEAR_16_AVX2
#define HAS_SCALEROWUP2LINEAR_16_AVX2
#endif

// The following are available on all x86 platforms, but

@@ -114,6 +124,10 @@ extern "C" {
#define HAS_SCALEROWDOWN4_NEON
#define HAS_SCALEUVROWDOWN2BOX_NEON
#define HAS_SCALEUVROWDOWNEVEN_NEON
#define HAS_SCALECOLUP2LINEAR_NEON
#define HAS_SCALEROWUP2LINEAR_NEON
#define HAS_SCALECOLUP2LINEAR_16_NEON
#define HAS_SCALEROWUP2LINEAR_16_NEON
#endif

#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)

@@ -279,6 +293,40 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* d,
int dst_width);

void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);

void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,
int dst_width,

@@ -508,6 +556,88 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width);

void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSE2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);

void ScaleRowDown2_Any_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,

@@ -1143,6 +1273,39 @@ void ScaleRowDown38_2_Box_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);

void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleRowUp2_Linear_16_Any_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleRowUp2_Bilinear_16_Any_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);

void ScaleAddRow_NEON(const uint8_t* src_ptr, uint16_t* dst_ptr, int src_width);
void ScaleAddRow_Any_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 1773
+#define LIBYUV_VERSION 1774

 #endif  // INCLUDE_LIBYUV_VERSION_H_

@@ -159,6 +159,102 @@ int I420ToI444(const uint8_t* src_y,
dst_uv_height);
}

// 420 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I010ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}

if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_u, dst_stride_u, Abs(width),
Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1),
SUBSAMPLE(height, 1, 1), dst_v, dst_stride_v, Abs(width),
Abs(height), kFilterBilinear);
return 0;
}

// 422 chroma to 444 chroma, 10/12 bit version
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
const uint16_t* src_v,
int src_stride_v,
uint16_t* dst_y,
int dst_stride_y,
uint16_t* dst_u,
int dst_stride_u,
uint16_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}

if (dst_y) {
ScalePlane_12(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane_12(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane_12(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}

// 422 chroma is 1/2 width, 1x height
// 444 chroma is 1x width, 1x height
LIBYUV_API
int I422ToI444(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
if (width == 0 || height == 0) {
return -1;
}

if (dst_y) {
ScalePlane(src_y, src_stride_y, width, height, dst_y, dst_stride_y,
Abs(width), Abs(height), kFilterBilinear);
}
ScalePlane(src_u, src_stride_u, SUBSAMPLE(width, 1, 1), height, dst_u,
dst_stride_u, Abs(width), Abs(height), kFilterBilinear);
ScalePlane(src_v, src_stride_v, SUBSAMPLE(width, 1, 1), height, dst_v,
dst_stride_v, Abs(width), Abs(height), kFilterBilinear);
return 0;
}

// Copy to I400. Source can be I420,422,444,400,NV12,NV21
LIBYUV_API
int I400Copy(const uint8_t* src_y,

source/scale.cc
@@ -1336,6 +1336,238 @@ void ScalePlaneBilinearUp(int src_width,
}
}

// Scale plane, horizontally 2 times, vertically any scale factor.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times its
// original width, using linear interpolation.
// This is used to scale U and V planes of I422 to I444.
void ScalePlaneUp2_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*ScaleRowUp)(const uint8_t* src_ptr, uint8_t* dst_ptr, int dst_width) =
ScaleRowUp2_Linear_Any_C;
int i;
int y;
int dy;

// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSE2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_SSSE3;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_AVX2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_Any_NEON;
}
#endif

if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}

// Scale plane, up 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
void ScalePlaneUp2_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint8_t* src_ptr,
uint8_t* dst_ptr) {
void (*Scale2RowUp)(const uint8_t* src_ptr, ptrdiff_t src_stride,
uint8_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_Any_C;
int x;

// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSE2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
if (TestCpuFlag(kCpuHasSSSE3)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_SSSE3;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_AVX2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_Any_NEON;
}
#endif

if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}

// Scale a plane with at most 14 bit samples, horizontally up 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original width, using linear interpolation.
// stride is in units of uint16_t.
// This is used to scale U and V planes of I210 to I410 and I212 to I412.
void ScalePlaneUp2_16_Linear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*ScaleRowUp)(const uint16_t* src_ptr, uint16_t* dst_ptr,
int dst_width) = ScaleRowUp2_Linear_16_Any_C;
int i;
int y;
int dy;

// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_SSE2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_AVX2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
ScaleRowUp = ScaleRowUp2_Linear_16_Any_NEON;
}
#endif

if (dst_height == 1) {
ScaleRowUp(src_ptr + ((src_height - 1) / 2) * src_stride, dst_ptr,
dst_width);
} else {
dy = FixedDiv(src_height - 1, dst_height - 1);
y = (1 << 15) - 1;
for (i = 0; i < dst_height; ++i) {
ScaleRowUp(src_ptr + (y >> 16) * src_stride, dst_ptr, dst_width);
dst_ptr += dst_stride;
y += dy;
}
}
}

// Scale a plane with at most 12 bit samples, up 2 times.
// This is an optimized version for scaling up a plane to 2 times its
// original size, using bilinear interpolation.
// stride is in units of uint16_t.
// This is used to scale U and V planes of I010 to I410 and I012 to I412.
void ScalePlaneUp2_16_Bilinear(int src_width,
int src_height,
int dst_width,
int dst_height,
int src_stride,
int dst_stride,
const uint16_t* src_ptr,
uint16_t* dst_ptr) {
void (*Scale2RowUp)(const uint16_t* src_ptr, ptrdiff_t src_stride,
uint16_t* dst_ptr, ptrdiff_t dst_stride, int dst_width) =
ScaleRowUp2_Bilinear_16_Any_C;
int x;

// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_SSE2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
if (TestCpuFlag(kCpuHasAVX2)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_AVX2;
}
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
if (TestCpuFlag(kCpuHasNEON)) {
Scale2RowUp = ScaleRowUp2_Bilinear_16_Any_NEON;
}
#endif

if (src_height == 1) {
Scale2RowUp(src_ptr, 0, dst_ptr, dst_stride, dst_width);
} else {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
dst_ptr += dst_stride;
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
Scale2RowUp(src_ptr, 0, dst_ptr, 0, dst_width);
}
}
}

void ScalePlaneBilinearUp_16(int src_width,
int src_height,
int dst_width,
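For reference, a restatement of the fixed-point arithmetic in the kernels above as plain weights; this follows directly from the (3 * near + far + 2) >> 2 and 9/3/3/1 expressions in the code, with the +2 and +8 terms providing rounding before the shifts:

d_{2x} = \tfrac{3}{4} s_x + \tfrac{1}{4} s_{x+1}, \qquad
d_{2x+1} = \tfrac{1}{4} s_x + \tfrac{3}{4} s_{x+1}

d = \tfrac{9}{16} s_x + \tfrac{3}{16} s_{x+1} + \tfrac{3}{16} t_x + \tfrac{1}{16} t_{x+1}

The bilinear weights are the separable product of two (3/4, 1/4) linear filters applied across the row pair s, t.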
@@ -1627,6 +1859,17 @@ void ScalePlane(const uint8_t* src,
dst_stride, src, dst);
return;
}
if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if (filtering && dst_height > src_height) {
ScalePlaneBilinearUp(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst, filtering);

@@ -1724,6 +1967,43 @@ void ScalePlane_16(const uint16_t* src,
dst_stride, src, dst);
}

LIBYUV_API
void ScalePlane_12(const uint16_t* src,
int src_stride,
int src_width,
int src_height,
uint16_t* dst,
int dst_stride,
int dst_width,
int dst_height,
enum FilterMode filtering) {
// Simplify filtering when possible.
filtering = ScaleFilterReduce(src_width, src_height, dst_width, dst_height,
filtering);

// Negative height means invert the image.
if (src_height < 0) {
src_height = -src_height;
src = src + (src_height - 1) * src_stride;
src_stride = -src_stride;
}

if ((dst_width + 1) / 2 == src_width && filtering == kFilterLinear) {
ScalePlaneUp2_16_Linear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}
if ((dst_height + 1) / 2 == src_height && (dst_width + 1) / 2 == src_width &&
(filtering == kFilterBilinear || filtering == kFilterBox)) {
ScalePlaneUp2_16_Bilinear(src_width, src_height, dst_width, dst_height,
src_stride, dst_stride, src, dst);
return;
}

ScalePlane_16(src, src_stride, src_width, src_height, dst, dst_stride,
dst_width, dst_height, filtering);
}

// Scale an I420 image.
// This function in turn calls a scaling function for each plane.

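As a usage sketch of the dispatch above (the dimensions and the helper name Upscale2x are assumptions for illustration), a plane upscale satisfying both conditions routes to the new specialized path:

#include "libyuv/scale.h"  // ScalePlane, kFilterBilinear

// 2x upscale of a 100x50 plane; (dst_width + 1) / 2 == src_width and
// (dst_height + 1) / 2 == src_height, so kFilterBilinear takes
// ScalePlaneUp2_Bilinear rather than the generic bilinear-up path.
void Upscale2x(const uint8_t* src, uint8_t* dst) {
  ScalePlane(src, 100, 100, 50, dst, 200, 200, 100, kFilterBilinear);
}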
@@ -609,6 +609,191 @@ CANY(ScaleARGBFilterCols_Any_MSA,
#endif
#undef CANY

// Scale up horizontally 2 times using linear filter.
#define SUH2LANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, PTYPE* dst_ptr, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
dst_ptr[0] = src_ptr[0]; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(src_ptr, dst_ptr + 1, n); \
} \
C(src_ptr + (n / 2), dst_ptr + n + 1, r); \
} \
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}

// Even the C version needs to be wrapped, because boundary pixels have to
// be handled differently.

SUH2LANY(ScaleRowUp2_Linear_Any_C,
ScaleRowUp2_Linear_C,
ScaleRowUp2_Linear_C,
0,
uint8_t)

SUH2LANY(ScaleRowUp2_Linear_16_Any_C,
ScaleRowUp2_Linear_16_C,
ScaleRowUp2_Linear_16_C,
0,
uint16_t)

#ifdef HAS_SCALECOLUP2LINEAR_SSE2
SUH2LANY(ScaleRowUp2_Linear_Any_SSE2,
ScaleRowUp2_Linear_SSE2,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
SUH2LANY(ScaleRowUp2_Linear_Any_SSSE3,
ScaleRowUp2_Linear_SSSE3,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SUH2LANY(ScaleRowUp2_Linear_16_Any_SSE2,
ScaleRowUp2_Linear_16_SSE2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_AVX2
SUH2LANY(ScaleRowUp2_Linear_Any_AVX2,
ScaleRowUp2_Linear_AVX2,
ScaleRowUp2_Linear_C,
31,
uint8_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_16_AVX2
SUH2LANY(ScaleRowUp2_Linear_16_Any_AVX2,
ScaleRowUp2_Linear_16_AVX2,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_NEON
SUH2LANY(ScaleRowUp2_Linear_Any_NEON,
ScaleRowUp2_Linear_NEON,
ScaleRowUp2_Linear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_16_NEON
SUH2LANY(ScaleRowUp2_Linear_16_Any_NEON,
ScaleRowUp2_Linear_16_NEON,
ScaleRowUp2_Linear_16_C,
15,
uint16_t)
#endif

#undef SUH2LANY

// Scale up 2 times using bilinear filter.
// This function produces 2 rows at a time.
#define SU2BLANY(NAME, SIMD, C, MASK, PTYPE) \
void NAME(const PTYPE* src_ptr, ptrdiff_t src_stride, PTYPE* dst_ptr, \
ptrdiff_t dst_stride, int dst_width) { \
int work_width = (dst_width - 1) & ~1; \
int r = work_width & MASK; \
int n = work_width & ~MASK; \
const PTYPE* sa = src_ptr; \
const PTYPE* sb = src_ptr + src_stride; \
PTYPE* da = dst_ptr; \
PTYPE* db = dst_ptr + dst_stride; \
da[0] = (3 * sa[0] + sb[0]) >> 2; \
db[0] = (sa[0] + 3 * sb[0]) >> 2; \
if (work_width > 0) { \
if (n != 0) { \
SIMD(sa, sb - sa, da + 1, db - da, n); \
} \
C(sa + (n / 2), sb - sa, da + n + 1, db - da, r); \
} \
da[dst_width - 1] = \
(3 * sa[(dst_width - 1) / 2] + sb[(dst_width - 1) / 2]) >> 2; \
db[dst_width - 1] = \
(sa[(dst_width - 1) / 2] + 3 * sb[(dst_width - 1) / 2]) >> 2; \
}

SU2BLANY(ScaleRowUp2_Bilinear_Any_C,
ScaleRowUp2_Bilinear_C,
ScaleRowUp2_Bilinear_C,
0,
uint8_t)

SU2BLANY(ScaleRowUp2_Bilinear_16_Any_C,
ScaleRowUp2_Bilinear_16_C,
ScaleRowUp2_Bilinear_16_C,
0,
uint16_t)

#ifdef HAS_SCALEROWUP2LINEAR_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSE2,
ScaleRowUp2_Bilinear_SSE2,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_SSE2,
ScaleRowUp2_Bilinear_16_SSE2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
SU2BLANY(ScaleRowUp2_Bilinear_Any_SSSE3,
ScaleRowUp2_Bilinear_SSSE3,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_Any_AVX2,
ScaleRowUp2_Bilinear_AVX2,
ScaleRowUp2_Bilinear_C,
31,
uint8_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_AVX2,
ScaleRowUp2_Bilinear_16_AVX2,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_NEON
SU2BLANY(ScaleRowUp2_Bilinear_Any_NEON,
ScaleRowUp2_Bilinear_NEON,
ScaleRowUp2_Bilinear_C,
15,
uint8_t)
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_NEON
SU2BLANY(ScaleRowUp2_Bilinear_16_Any_NEON,
ScaleRowUp2_Bilinear_16_NEON,
ScaleRowUp2_Bilinear_16_C,
15,
uint16_t)
#endif

#undef SU2BLANY

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv

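To make the Any-wrapper arithmetic concrete, a worked example under assumed values (dst_width = 37 and MASK = 15, i.e. a 16-element SIMD kernel):

#include <assert.h>

// Worked example of the SUH2LANY splitting logic.
void CheckSplit(void) {
  const int dst_width = 37, MASK = 15;
  int work_width = (dst_width - 1) & ~1;  // 36 interior output samples
  int r = work_width & MASK;              // 4 samples left for the C tail
  int n = work_width & ~MASK;             // 32 samples done by SIMD
  assert(work_width == 36 && n == 32 && r == 4);
  // dst_ptr[0] and dst_ptr[36] are edge-replicated separately, so the SIMD
  // kernel writes dst_ptr + 1 .. dst_ptr + 32 and C finishes dst_ptr + 33 on.
}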
@@ -400,6 +400,95 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}

// sample position: (O is src sample position, X is dst sample position)
//
//      v dst_ptr at here                    v stop at here
//  X O X  X O X  X O X  X O X  X O X
//    ^ src_ptr at here
void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}

// sample position: (O is src sample position, X is dst sample position)
//
//      src_ptr at here
//  X v X   X   X   X   X   X   X   X   X
//    O       O       O       O       O
//  X   X   X   X   X   X   X   X   X   X
//  ^ dst_ptr at here                ^ stop at here
//  X   X   X   X   X   X   X   X   X   X
//    O       O       O       O       O
//  X   X   X   X   X   X   X   X   X   X
void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* s = src_ptr;
const uint8_t* t = src_ptr + src_stride;
uint8_t* d = dst_ptr;
uint8_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}

// Only suitable for at most 14 bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
dst_ptr[2 * x + 0] = (src_ptr[x + 0] * 3 + src_ptr[x + 1] * 1 + 2) >> 2;
dst_ptr[2 * x + 1] = (src_ptr[x + 0] * 1 + src_ptr[x + 1] * 3 + 2) >> 2;
}
}

// Only suitable for at most 12 bit range.
void ScaleRowUp2_Bilinear_16_C(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* s = src_ptr;
const uint16_t* t = src_ptr + src_stride;
uint16_t* d = dst_ptr;
uint16_t* e = dst_ptr + dst_stride;
int src_width = dst_width >> 1;
int x;
assert((dst_width % 2 == 0) && (dst_width >= 0));
for (x = 0; x < src_width; ++x) {
d[2 * x + 0] =
(s[x + 0] * 9 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 1 + 8) >> 4;
d[2 * x + 1] =
(s[x + 0] * 3 + s[x + 1] * 9 + t[x + 0] * 1 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 0] =
(s[x + 0] * 3 + s[x + 1] * 1 + t[x + 0] * 9 + t[x + 1] * 3 + 8) >> 4;
e[2 * x + 1] =
(s[x + 0] * 1 + s[x + 1] * 3 + t[x + 0] * 3 + t[x + 1] * 9 + 8) >> 4;
}
}

// Scales a single row of pixels using point sampling.
void ScaleCols_C(uint8_t* dst_ptr,
const uint8_t* src_ptr,

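A quick way to sanity-check the linear kernel above; this is a standalone copy of its per-pair math with assumed test values (the real kernel handles the row edges via the Any wrapper):

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t src[5] = {0, 40, 80, 120, 160};
  uint8_t dst[8];
  // Same (3 * near + far + 2) >> 2 arithmetic as ScaleRowUp2_Linear_C.
  for (int x = 0; x < 4; ++x) {
    dst[2 * x + 0] = (uint8_t)((src[x] * 3 + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + src[x + 1] * 3 + 2) >> 2);
  }
  for (int i = 0; i < 8; ++i) {
    printf("%d ", dst[i]);  // expected: 10 30 50 70 90 110 130 150
  }
  printf("\n");
  return 0;
}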
@ -785,6 +785,836 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
#ifdef HAS_SCALECOLUP2LINEAR_SSE2
|
||||
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
"psllw $1,%%xmm6 \n" // all 2
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm1 \n" // 01234567
|
||||
"movq 1(%0),%%xmm2 \n" // 12345678
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
|
||||
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
|
||||
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
|
||||
"movdqa %%xmm1,%%xmm4 \n"
|
||||
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
|
||||
"paddw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm3,%%xmm5 \n"
|
||||
"paddw %%xmm6,%%xmm4 \n"
|
||||
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
|
||||
"paddw %%xmm5,%%xmm5 \n"
|
||||
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
|
||||
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
|
||||
|
||||
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
|
||||
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm2,%%xmm1 \n"
|
||||
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
|
||||
"paddw %%xmm6,%%xmm1 \n"
|
||||
"paddw %%xmm3,%%xmm3 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
|
||||
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
|
||||
|
||||
"packuswb %%xmm1,%%xmm5 \n"
|
||||
"movdqu %%xmm5,(%1) \n"
|
||||
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
|
||||
void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
// above line
|
||||
"movq (%0),%%xmm1 \n" // 01234567
|
||||
"movq 1(%0),%%xmm2 \n" // 12345678
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
|
||||
"punpcklbw %%xmm1,%%xmm1 \n" // 0011223344556677
|
||||
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
|
||||
|
||||
"movdqa %%xmm1,%%xmm4 \n"
|
||||
"punpcklbw %%xmm0,%%xmm4 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm5 \n" // 11223344 (16)
|
||||
"paddw %%xmm5,%%xmm4 \n" // near+far
|
||||
"movdqa %%xmm3,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm5 \n" // 01122334 (16)
|
||||
"paddw %%xmm5,%%xmm5 \n" // 2*near
|
||||
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
|
||||
|
||||
"punpckhbw %%xmm0,%%xmm1 \n" // 44556677 (16)
|
||||
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm2,%%xmm1 \n"
|
||||
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
|
||||
"paddw %%xmm3,%%xmm3 \n" // 2*near
|
||||
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
|
||||
|
||||
// below line
|
||||
"movq (%0,%3),%%xmm6 \n" // 01234567
|
||||
"movq 1(%0,%3),%%xmm2 \n" // 12345678
|
||||
"movdqa %%xmm6,%%xmm3 \n"
|
||||
"punpcklbw %%xmm2,%%xmm3 \n" // 0112233445566778
|
||||
"punpcklbw %%xmm6,%%xmm6 \n" // 0011223344556677
|
||||
"punpcklbw %%xmm2,%%xmm2 \n" // 1122334455667788
|
||||
|
||||
"movdqa %%xmm6,%%xmm5 \n"
|
||||
"punpcklbw %%xmm0,%%xmm5 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm7 \n"
|
||||
"punpcklbw %%xmm0,%%xmm7 \n" // 11223344 (16)
|
||||
"paddw %%xmm7,%%xmm5 \n" // near+far
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"punpcklbw %%xmm0,%%xmm7 \n" // 01122334 (16)
|
||||
"paddw %%xmm7,%%xmm7 \n" // 2*near
|
||||
"paddw %%xmm7,%%xmm5 \n" // 3*near+far (2, lo)
|
||||
|
||||
"punpckhbw %%xmm0,%%xmm6 \n" // 44556677 (16)
|
||||
"punpckhbw %%xmm0,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm6,%%xmm2 \n" // near+far
|
||||
"punpckhbw %%xmm0,%%xmm3 \n" // 45566778 (16)
|
||||
"paddw %%xmm3,%%xmm3 \n" // 2*near
|
||||
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
|
||||
|
||||
// xmm4 xmm1
|
||||
// xmm5 xmm2
|
||||
"pcmpeqw %%xmm0,%%xmm0 \n"
|
||||
"psrlw $15,%%xmm0 \n"
|
||||
"psllw $3,%%xmm0 \n" // all 8
|
||||
|
||||
"movdqa %%xmm4,%%xmm3 \n"
|
||||
"movdqa %%xmm5,%%xmm6 \n"
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
|
||||
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, lo)
|
||||
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
|
||||
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
|
||||
"psrlw $4,%%xmm3 \n" // ^ div by 16
|
||||
|
||||
"movdqa %%xmm1,%%xmm7 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"psllw $1,%%xmm7 \n" // 6*near+2*far (1, hi)
|
||||
"paddw %%xmm0,%%xmm6 \n" // 3*near+far+8 (2, hi)
|
||||
"paddw %%xmm1,%%xmm7 \n" // 9*near+3*far (1, hi)
|
||||
"paddw %%xmm6,%%xmm7 \n" // 9 3 3 1 + 8 (1, hi)
|
||||
"psrlw $4,%%xmm7 \n" // ^ div by 16
|
||||
|
||||
"packuswb %%xmm7,%%xmm3 \n"
|
||||
"movdqu %%xmm3,(%1) \n" // save above line
|
||||
|
||||
"movdqa %%xmm5,%%xmm3 \n"
|
||||
"paddw %%xmm0,%%xmm4 \n" // 3*near+far+8 (1, lo)
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
|
||||
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
|
||||
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (lo)
|
||||
"psrlw $4,%%xmm5 \n" // ^ div by 16
|
||||
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"paddw %%xmm0,%%xmm1 \n" // 3*near+far+8 (1, hi)
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
|
||||
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
|
||||
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (hi)
|
||||
"psrlw $4,%%xmm2 \n" // ^ div by 16
|
||||
|
||||
"packuswb %%xmm2,%%xmm5 \n"
|
||||
"movdqu %%xmm5,(%1,%4) \n" // save below line
|
||||
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALECOLUP2LINEAR_16_SSE2
|
||||
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
"psllw $1,%%xmm6 \n" // all 2
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
|
||||
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
|
||||
"movdqa %%xmm1,%%xmm4 \n"
|
||||
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
|
||||
"paddw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"paddw %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
|
||||
"psllw $1,%%xmm5 \n"
|
||||
"paddw %%xmm4,%%xmm5 \n" // 3*near+far+2 (lo)
|
||||
"psrlw $2,%%xmm5 \n" // 3/4*near+1/4*far (lo)
|
||||
"movdqu %%xmm5,(%1) \n"
|
||||
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
|
||||
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
|
||||
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm2,%%xmm1 \n"
|
||||
"paddw %%xmm6,%%xmm1 \n"
|
||||
"psllw $1,%%xmm3 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n" // 3*near+far+2 (hi)
|
||||
"psrlw $2,%%xmm1 \n" // 3/4*near+1/4*far (hi)
|
||||
"movdqu %%xmm1,0x10(%1) \n"
|
||||
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
:
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2LINEAR_16_SSE2
|
||||
void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint16_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"pxor %%xmm0,%%xmm0 \n" // 0
|
||||
"pcmpeqw %%xmm7,%%xmm7 \n"
|
||||
"psrlw $15,%%xmm7 \n"
|
||||
"psllw $3,%%xmm7 \n" // all 8
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
// above line
|
||||
"movdqu (%0),%%xmm1 \n" // 01234567 (16)
|
||||
"movdqu 2(%0),%%xmm2 \n" // 12345678 (16)
|
||||
"movdqa %%xmm1,%%xmm4 \n"
|
||||
"punpcklwd %%xmm4,%%xmm4 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
|
||||
"paddw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
|
||||
"paddw %%xmm5,%%xmm5 \n"
|
||||
"paddw %%xmm5,%%xmm4 \n" // 3*near+far (1, lo)
|
||||
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
|
||||
"punpckhwd %%xmm1,%%xmm1 \n" // 44556677 (16)
|
||||
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm2,%%xmm1 \n"
|
||||
"paddw %%xmm3,%%xmm3 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n" // 3*near+far (1, hi)
|
||||
|
||||
// below line
|
||||
"movdqu (%0,%3,2),%%xmm6 \n" // 01234567 (16)
|
||||
"movdqu 2(%0,%3,2),%%xmm2 \n" // 12345678 (16)
|
||||
"movdqa %%xmm6,%%xmm3 \n"
|
||||
"punpcklwd %%xmm3,%%xmm3 \n" // 00112233 (16)
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"punpcklwd %%xmm5,%%xmm5 \n" // 11223344 (16)
|
||||
"paddw %%xmm5,%%xmm3 \n"
|
||||
"movdqa %%xmm6,%%xmm5 \n"
|
||||
"punpcklwd %%xmm2,%%xmm5 \n" // 01122334 (16)
|
||||
"paddw %%xmm5,%%xmm5 \n"
|
||||
"paddw %%xmm3,%%xmm5 \n" // 3*near+far (2, lo)
|
||||
|
||||
"movdqa %%xmm6,%%xmm3 \n"
|
||||
"punpckhwd %%xmm2,%%xmm3 \n" // 45566778 (16)
|
||||
"punpckhwd %%xmm6,%%xmm6 \n" // 44556677 (16)
|
||||
"punpckhwd %%xmm2,%%xmm2 \n" // 55667788 (16)
|
||||
"paddw %%xmm6,%%xmm2 \n"
|
||||
"paddw %%xmm3,%%xmm3 \n"
|
||||
"paddw %%xmm3,%%xmm2 \n" // 3*near+far (2, hi)
|
||||
|
||||
// xmm4 xmm1
|
||||
// xmm5 xmm2
|
||||
|
||||
"movdqa %%xmm4,%%xmm3 \n"
|
||||
"movdqa %%xmm5,%%xmm6 \n"
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, lo)
|
||||
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, lo)
|
||||
"paddw %%xmm4,%%xmm3 \n" // 9*near+3*far (1, lo)
|
||||
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, lo)
|
||||
"psrlw $4,%%xmm3 \n" // ^ div by 16
|
||||
"movdqu %%xmm3,(%1) \n"
|
||||
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (1, hi)
|
||||
"paddw %%xmm7,%%xmm6 \n" // 3*near+far+8 (2, hi)
|
||||
"paddw %%xmm1,%%xmm3 \n" // 9*near+3*far (1, hi)
|
||||
"paddw %%xmm6,%%xmm3 \n" // 9 3 3 1 + 8 (1, hi)
|
||||
"psrlw $4,%%xmm3 \n" // ^ div by 16
|
||||
"movdqu %%xmm3,0x10(%1) \n"
|
||||
|
||||
"movdqa %%xmm5,%%xmm3 \n"
|
||||
"paddw %%xmm7,%%xmm4 \n" // 3*near+far+8 (1, lo)
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, lo)
|
||||
"paddw %%xmm3,%%xmm5 \n" // 9*near+3*far (2, lo)
|
||||
"paddw %%xmm4,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
|
||||
"psrlw $4,%%xmm5 \n" // ^ div by 16
|
||||
"movdqu %%xmm5,(%1,%4,2) \n"
|
||||
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"paddw %%xmm7,%%xmm1 \n" // 3*near+far+8 (1, hi)
|
||||
"psllw $1,%%xmm3 \n" // 6*near+2*far (2, hi)
|
||||
"paddw %%xmm3,%%xmm2 \n" // 9*near+3*far (2, hi)
|
||||
"paddw %%xmm1,%%xmm2 \n" // 9 3 3 1 + 8 (2, hi)
|
||||
"psrlw $4,%%xmm2 \n" // ^ div by 16
|
||||
"movdqu %%xmm2,0x10(%1,%4,2) \n"
|
||||
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALECOLUP2LINEAR_SSSE3
|
||||
static const uvec8 kLinearMadd31_SSSE3 = {3, 1, 1, 3, 3, 1, 1, 3,
|
||||
3, 1, 1, 3, 3, 1, 1, 3};
|
||||
|
||||
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"pcmpeqw %%xmm4,%%xmm4 \n"
|
||||
"psrlw $15,%%xmm4 \n"
|
||||
"psllw $1,%%xmm4 \n" // all 2
|
||||
"movdqu %3,%%xmm3 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n" // 01234567
|
||||
"movq 1(%0),%%xmm1 \n" // 12345678
|
||||
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
|
||||
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
|
||||
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
|
||||
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
|
||||
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
|
||||
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
|
||||
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
|
||||
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
|
||||
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
|
||||
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
|
||||
"vmovdqu %%xmm0,(%1) \n"
|
||||
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "m"(kLinearMadd31_SSSE3) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2LINEAR_SSSE3
|
||||
void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"pcmpeqw %%xmm6,%%xmm6 \n"
|
||||
"psrlw $15,%%xmm6 \n"
|
||||
"psllw $3,%%xmm6 \n" // all 8
|
||||
"movdqu %5,%%xmm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n" // 01234567
|
||||
"movq 1(%0),%%xmm1 \n" // 12345678
|
||||
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
|
||||
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
|
||||
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
|
||||
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
|
||||
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
|
||||
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"movq 1(%0,%3),%%xmm4 \n"
|
||||
"punpcklwd %%xmm1,%%xmm1 \n"
|
||||
"punpcklwd %%xmm4,%%xmm4 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"punpckhdq %%xmm4,%%xmm3 \n"
|
||||
"punpckldq %%xmm4,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
|
||||
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
|
||||
|
||||
// xmm0 xmm2
|
||||
// xmm1 xmm3
|
||||
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
|
||||
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
|
||||
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
|
||||
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
|
||||
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
|
||||
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
|
||||
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
|
||||
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
|
||||
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
|
||||
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
|
||||
|
||||
"movdqa %%xmm2,%%xmm0 \n"
|
||||
"movdqa %%xmm3,%%xmm1 \n"
|
||||
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
|
||||
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
|
||||
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
|
||||
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
|
||||
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
|
||||
|
||||
"movdqa %%xmm3,%%xmm1 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
|
||||
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
|
||||
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
|
||||
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
|
||||
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
|
||||
|
||||
"packuswb %%xmm0,%%xmm4 \n"
|
||||
"movdqu %%xmm4,(%1) \n" // store above
|
||||
"packuswb %%xmm1,%%xmm5 \n"
|
||||
"movdqu %%xmm5,(%1,%4) \n" // store below
|
||||
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)), // %4
|
||||
"m"(kLinearMadd31_SSSE3) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALECOLUP2LINEAR_AVX2
|
||||
static const lvec8 kLinearMadd31_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3, 3, 1, 1,
|
||||
3, 3, 1, 1, 3, 3, 1, 1, 3, 3, 1,
|
||||
1, 3, 3, 1, 1, 3, 3, 1, 1, 3};
|
||||
|
||||
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
|
||||
"vpsrlw $15,%%ymm4,%%ymm4 \n"
|
||||
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
|
||||
"vmovdqu %3,%%ymm3 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
|
||||
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
|
||||
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
|
||||
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
|
||||
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
|
||||
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
|
||||
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
|
||||
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
|
||||
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
|
||||
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
|
||||
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vmovdqu %%ymm0,(%1) \n"
|
||||
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "m"(kLinearMadd31_AVX2) // %3
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_SCALEROWUP2LINEAR_AVX2
|
||||
void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
uint8_t* dst_ptr,
|
||||
ptrdiff_t dst_stride,
|
||||
int dst_width) {
|
||||
asm volatile(
|
||||
|
||||
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
|
||||
"vpsrlw $15,%%ymm6,%%ymm6 \n"
|
||||
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
|
||||
"vmovdqu %5,%%ymm7 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
|
||||
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
|
||||
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
|
||||
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
|
||||
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
|
||||
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
|
||||
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
|
||||
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
|
||||
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
|
||||
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
|
||||
|
||||
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
|
||||
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
|
||||
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
|
||||
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
|
||||
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
|
||||
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
|
||||
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
|
||||
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
|
||||
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
|
||||
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
|
||||
|
||||
// ymm0 ymm1
|
||||
// ymm2 ymm3
|
||||
|
||||
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
|
||||
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
|
||||
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
|
||||
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
|
||||
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
|
||||
|
||||
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
|
||||
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
|
||||
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
|
||||
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
|
||||
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
|
||||
|
||||
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
|
||||
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
|
||||
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
|
||||
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
|
||||
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
|
||||
|
||||
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
|
||||
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
|
||||
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
|
||||
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
|
||||
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
|
||||
|
||||
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
|
||||
"vmovdqu %%ymm4,(%1) \n" // store above
|
||||
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
|
||||
"vmovdqu %%ymm5,(%1,%4) \n" // store below
|
||||
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
|
||||
"sub $0x20,%2 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width) // %2
|
||||
: "r"((intptr_t)(src_stride)), // %3
|
||||
"r"((intptr_t)(dst_stride)), // %4
|
||||
"m"(kLinearMadd31_AVX2) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif

#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
static const lvec16 kLinearMadd31_16_AVX2 = {3, 1, 1, 3, 3, 1, 1, 3,
3, 1, 1, 3, 3, 1, 1, 3};

void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(

"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2

LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)

"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000

"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"

"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}

// This version can handle full 16bit range but is slower.
void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(

"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2

LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)

"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000

"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"

"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "m"(kLinearMadd31_16_AVX2) // %3
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
}
#endif
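[Editor's note: why there are two variants. vpmaddwd produces 32-bit sums, but the fast path immediately repacks them with vpackssdw (signed 16-bit saturation) and keeps the +2/>>2 rounding in 16-bit lanes, so 3*near+far must fit in a signed 16-bit word. That holds for the 10/12-bit content this change targets, not for full-range 16-bit samples, which is what the _Full variant's 32-bit vpaddd/vpsrad sequence is for. Illustrative bound (assumed arithmetic, not library code):]

// Fast path: the 3*near+far intermediate is packed to signed 16 bits.
static_assert(3 * 4095 + 4095 + 2 <= 32767,
              "12-bit samples fit the packed 16-bit intermediate");
// 3 * 65535 + 65535 = 262140 does not, so full-range 16-bit input must
// take ScaleRowUp2_Linear_16_AVX2_Full, which defers the pack until
// after the arithmetic shift.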

#ifdef HAS_SCALEROWUP2LINEAR_16_AVX2
void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(

"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8

LABELALIGN
"1: \n"

"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)

"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)

"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above

"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below

"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}

// This version can handle full 16bit range but is slower.
void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(

"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8

LABELALIGN
"1: \n"

"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)

"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)

"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)

"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)

"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)

"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)

"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below

"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
: "r"((intptr_t)(src_stride)), // %3
"r"((intptr_t)(dst_stride)), // %4
"m"(kLinearMadd31_16_AVX2) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif
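[Editor's note: the 9:3:3:1 weights used throughout these kernels are just the separable (3/4, 1/4) linear filter applied in both directions. A worked example of the arithmetic (illustrative, not library code):]

// Output pixel nearest to source pixel p00, with neighbors p01 (right),
// p10 (below) and p11 (diagonal):
//   (3/4)(3/4) = 9/16, (3/4)(1/4) = 3/16, (1/4)(3/4) = 3/16, (1/4)(1/4) = 1/16
// In 4-bit fixed point with round-to-nearest:
//   dst = (9 * p00 + 3 * p01 + 3 * p10 + 1 * p11 + 8) >> 4
// e.g. p00 = 100, p01 = 104, p10 = 108, p11 = 112:
//   (900 + 312 + 324 + 112 + 8) >> 4 = 1656 >> 4 = 103
// which the asm builds as 3*(3*near+far) of one row plus (3*near+far)
// of the other row, plus the +8 bias, then ">> 4" ("div by 16").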

// Reads 16xN bytes and produces 16 shorts at a time.
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
@@ -946,8 +1776,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
|
||||
@ -504,6 +504,200 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
|
||||
: "q0", "q1", "q2", "q3", "q13", "q14", "memory", "cc");
|
||||
}
|
||||
|
||||
void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
|
||||
uint8_t* dst_ptr,
|
||||
int dst_width) {
|
||||
const uint8_t* src_temp = src_ptr + 1;
|
||||
asm volatile(
|
||||
|
||||
"vmov.u16 q15, #3 \n"
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {d0}, [%0]! \n" // 01234567
|
||||
"vld1.8 {d2}, [%3]! \n" // 12345678
|
||||
|
||||
"vmovl.u8 q0, d0 \n" // 01234567 (16b)
|
||||
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
|
||||
"vmovq q2, q0 \n"
|
||||
"vmla.u16 q2, q1, q15 \n" // 3*near+far (odd)
|
||||
"vmla.u16 q1, q0, q15 \n" // 3*near+far (even)
|
||||
|
||||
"vrshrn.u16 d0, q1, #2 \n" // 3/4*near+1/4*far (odd)
|
||||
"vrshrn.u16 d1, q2, #2 \n" // 3/4*near+1/4*far (even)
|
||||
|
||||
"vst2.8 {d0, d1}, [%1]! \n" // store
|
||||
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_ptr), // %0
|
||||
"+r"(dst_ptr), // %1
|
||||
"+r"(dst_width), // %2
|
||||
"+r"(src_temp) // %3
|
||||
:
|
||||
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
|
||||
);
|
||||
}
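[Editor's note: a scalar model of this horizontal-only pass (hypothetical helper, illustrative only; edge columns omitted). Even outputs weight the left sample 3:1, odd outputs the right sample 3:1, with a +2 rounding bias before the shift by 2; note how the asm gets the shifted-by-one "far" vector cheaply by loading through src_temp = src_ptr + 1.]

#include <stdint.h>

// Scalar model of the 2x horizontal linear upsample (hypothetical helper).
static void ScaleRowUp2_Linear_Model(const uint8_t* src, uint8_t* dst,
                                     int src_width) {
  for (int i = 0; i < src_width - 1; ++i) {
    dst[2 * i] = (uint8_t)((3 * src[i] + src[i + 1] + 2) >> 2);
    dst[2 * i + 1] = (uint8_t)((src[i] + 3 * src[i + 1] + 2) >> 2);
  }
}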

void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;

asm volatile(

"vmov.u16 q15, #3 \n"

"1: \n"
"vld1.8 {d0}, [%0]! \n" // 01234567
"vld1.8 {d2}, [%5]! \n" // 12345678

"vmovl.u8 q0, d0 \n" // 01234567 (16b)
"vmovl.u8 q1, d2 \n" // 12345678 (16b)
"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (1, odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (1, even)

"vld1.8 {d4}, [%1]! \n" // 01234567
"vld1.8 {d6}, [%6]! \n" // 12345678

"vmovl.u8 q2, d4 \n" // 01234567 (16b)
"vmovl.u8 q3, d6 \n" // 12345678 (16b)
"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (2, odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (2, even)

// e o
// q1 q0
// q3 q2

"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)

// e o
// q5 q4
// q1 q0

"vrshrn.u16 d2, q1, #4 \n" // 2, even
"vrshrn.u16 d3, q0, #4 \n" // 2, odd
"vrshrn.u16 d0, q5, #4 \n" // 1, even
"vrshrn.u16 d1, q4, #4 \n" // 1, odd

"vst2.8 {d0, d1}, [%2]! \n" // store
"vst2.8 {d2, d3}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}

void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(

"vmov.u16 q15, #3 \n"

"1: \n"
"vld1.16 {q1}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q0}, [%3]! \n" // 12345678 (16b)

"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (even)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (odd)

"vrshr.u16 q0, q0, #2 \n" // 3/4*near+1/4*far (even)
"vrshr.u16 q1, q1, #2 \n" // 3/4*near+1/4*far (odd)

"vst2.16 {d0, d1, d2, d3}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
"+r"(src_temp) // %3
:
: "memory", "cc", "q0", "q1", "q2", "q15" // Clobber List
);
}

void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;

asm volatile(

"vmov.u16 q15, #3 \n"

"1: \n"
"add %5, %0, #2 \n"
"vld1.16 {q0}, [%0]! \n" // 01234567 (16b)
"vld1.16 {q1}, [%5]! \n" // 12345678 (16b)

"vmovq q2, q0 \n"
"vmla.u16 q0, q1, q15 \n" // 3*near+far (odd)
"vmla.u16 q1, q2, q15 \n" // 3*near+far (even)

"add %6, %1, #2 \n"
"vld1.16 {q2}, [%1]! \n" // 01234567 (16b)
"vld1.16 {q3}, [%6]! \n" // 12345678 (16b)

"vmovq q4, q2 \n"
"vmla.u16 q2, q3, q15 \n" // 3*near+far (odd)
"vmla.u16 q3, q4, q15 \n" // 3*near+far (even)

"vmovq q4, q2 \n"
"vmovq q5, q3 \n"
"vmla.u16 q4, q0, q15 \n" // 9 3 3 1 (1, odd)
"vmla.u16 q5, q1, q15 \n" // 9 3 3 1 (1, even)
"vmla.u16 q0, q2, q15 \n" // 9 3 3 1 (2, odd)
"vmla.u16 q1, q3, q15 \n" // 9 3 3 1 (2, even)

"vrshr.u16 q2, q1, #4 \n" // 2, even
"vrshr.u16 q3, q0, #4 \n" // 2, odd
"vrshr.u16 q0, q5, #4 \n" // 1, even
"vrshr.u16 q1, q4, #4 \n" // 1, odd

"vst2.16 {d0, d1, d2, d3}, [%2]! \n" // store
"vst2.16 {d4, d5, d6, d7}, [%3]! \n" // store
"subs %4, %4, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(dst_ptr), // %2
"+r"(dst_ptr1), // %3
"+r"(dst_width), // %4
"+r"(src_temp), // %5
"+r"(src_temp1) // %6
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5",
"q15" // Clobber List
);
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

@@ -535,6 +535,196 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"v19", "v30", "v31", "memory", "cc");
}

void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(

"movi v31.8b, #3 \n"

"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%1], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)

"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (even)

"rshrn v2.8b, v2.8h, #2 \n" // 3/4*near+1/4*far (odd)
"rshrn v1.8b, v3.8h, #2 \n" // 3/4*near+1/4*far (even)

"st2 {v1.8b, v2.8b}, [%2], #16 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v31" // Clobber List
);
}

void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
uint8_t* dst_ptr1 = dst_ptr + dst_stride;
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;

asm volatile(

"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"

"1: \n"
"ldr d0, [%0], #8 \n" // 01234567
"ldr d1, [%2], #8 \n" // 12345678
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

"ushll v2.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v3.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v2.8h, v1.8b, v31.8b \n" // 3*near+far (1, odd)
"umlal v3.8h, v0.8b, v31.8b \n" // 3*near+far (1, even)

"ldr d0, [%1], #8 \n"
"ldr d1, [%3], #8 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead

"ushll v4.8h, v0.8b, #0 \n" // 01234567 (16b)
"ushll v5.8h, v1.8b, #0 \n" // 12345678 (16b)
"umlal v4.8h, v1.8b, v31.8b \n" // 3*near+far (2, odd)
"umlal v5.8h, v0.8b, v31.8b \n" // 3*near+far (2, even)

"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v30.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v30.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v30.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v30.8h \n" // 9 3 3 1 (2, even)

"rshrn v2.8b, v2.8h, #4 \n" // 2, odd
"rshrn v1.8b, v3.8h, #4 \n" // 2, even
"rshrn v4.8b, v4.8h, #4 \n" // 1, odd
"rshrn v3.8b, v5.8h, #4 \n" // 1, even

"st2 {v1.8b, v2.8b}, [%5], #16 \n" // store 2
"st2 {v3.8b, v4.8b}, [%4], #16 \n" // store 1
"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v30",
"v31" // Clobber List
);
}

void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(

"movi v31.8h, #3 \n"

"1: \n"
"ld1 {v0.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v1.8h}, [%1], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

"mov v2.8h, v0.8h \n"
"mla v0.8h, v1.8h, v31.8h \n" // 3*near+far (odd)
"mla v1.8h, v2.8h, v31.8h \n" // 3*near+far (even)

"urshr v2.8h, v0.8h, #2 \n" // 3/4*near+1/4*far (odd)
"urshr v1.8h, v1.8h, #2 \n" // 3/4*near+1/4*far (even)

"st2 {v1.8h, v2.8h}, [%2], #32 \n" // store
"subs %w3, %w3, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_temp), // %1
"+r"(dst_ptr), // %2
"+r"(dst_width) // %3
:
: "memory", "cc", "v0", "v1", "v2", "v31" // Clobber List
);
}
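[Editor's note: unlike the x86 path, no _Full twin is added here. The 16-bit NEON kernels accumulate in uint16_t lanes, which leaves enough headroom for the 10/12-bit formats this change targets ("sample is expected to be in the low 12 bits") but would wrap for full-range 16-bit samples. Illustrative bound, assumed arithmetic only; the rounding add inside urshr/vrshr is computed at full precision, so only the 9:3:3:1 sum matters:]

// Largest value left in a uint16_t lane by the bilinear kernel:
//   3 * (3*near + far) + (3*near' + far') = 16 * max_sample
static_assert(16 * 4095 <= 65535, "12-bit samples survive uint16_t math");
// 16 * 65535 would wrap, so full-range 16-bit input is out of scope here.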

void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
const uint16_t* src_ptr1 = src_ptr + src_stride;
uint16_t* dst_ptr1 = dst_ptr + dst_stride;
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;

asm volatile(

"movi v31.8h, #3 \n"

"1: \n"
"ld1 {v2.8h}, [%0], #16 \n" // 01234567 (16b)
"ld1 {v3.8h}, [%2], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead

"mov v0.8h, v2.8h \n"
"mla v2.8h, v3.8h, v31.8h \n" // 3*near+far (odd)
"mla v3.8h, v0.8h, v31.8h \n" // 3*near+far (even)

"ld1 {v4.8h}, [%1], #16 \n" // 01234567 (16b)
"ld1 {v5.8h}, [%3], #16 \n" // 12345678 (16b)
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead

"mov v0.8h, v4.8h \n"
"mla v4.8h, v5.8h, v31.8h \n" // 3*near+far (odd)
"mla v5.8h, v0.8h, v31.8h \n" // 3*near+far (even)

"mov v0.8h, v4.8h \n"
"mov v1.8h, v5.8h \n"
"mla v4.8h, v2.8h, v31.8h \n" // 9 3 3 1 (1, odd)
"mla v5.8h, v3.8h, v31.8h \n" // 9 3 3 1 (1, even)
"mla v2.8h, v0.8h, v31.8h \n" // 9 3 3 1 (2, odd)
"mla v3.8h, v1.8h, v31.8h \n" // 9 3 3 1 (2, even)

"urshr v2.8h, v2.8h, #4 \n" // 2, odd
"urshr v1.8h, v3.8h, #4 \n" // 2, even
"urshr v4.8h, v4.8h, #4 \n" // 1, odd
"urshr v3.8h, v5.8h, #4 \n" // 1, even

"st2 {v3.8h, v4.8h}, [%4], #32 \n" // store 1
"st2 {v1.8h, v2.8h}, [%5], #32 \n" // store 2

"subs %w6, %w6, #16 \n" // 8 sample -> 16 sample
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_ptr1), // %1
"+r"(src_temp), // %2
"+r"(src_temp1), // %3
"+r"(dst_ptr), // %4
"+r"(dst_ptr1), // %5
"+r"(dst_width) // %6
:
: "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
"v31" // Clobber List
);
}

// Add a row of bytes to a row of shorts. Used for box filter.
// Reads 16 bytes and accumulates to 16 shorts at a time.
void ScaleAddRow_NEON(const uint8_t* src_ptr,

@@ -49,7 +49,8 @@ namespace libyuv {

#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, W1280, N, NEG, OFF, \
SRC_DEPTH) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \
static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \
@@ -81,6 +82,16 @@ namespace libyuv {
MemRandomize(src_y + OFF, kWidth * kHeight * SRC_BPC); \
MemRandomize(src_u + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
MemRandomize(src_v + OFF, kSrcHalfWidth * kSrcHalfHeight * SRC_BPC); \
SRC_T* src_y_p = reinterpret_cast<SRC_T*>(src_y + OFF); \
SRC_T* src_u_p = reinterpret_cast<SRC_T*>(src_u + OFF); \
SRC_T* src_v_p = reinterpret_cast<SRC_T*>(src_v + OFF); \
for (int i = 0; i < kWidth * kHeight; ++i) { \
src_y_p[i] = src_y_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
for (int i = 0; i < kSrcHalfWidth * kSrcHalfHeight; ++i) { \
src_u_p[i] = src_u_p[i] & ((1 << SRC_DEPTH) - 1); \
src_v_p[i] = src_v_p[i] & ((1 << SRC_DEPTH) - 1); \
} \
memset(dst_y_c, 1, kWidth* kHeight* DST_BPC); \
memset(dst_u_c, 2, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
memset(dst_v_c, 3, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
@@ -89,9 +100,7 @@ namespace libyuv {
memset(dst_v_opt, 103, kDstHalfWidth* kDstHalfHeight* DST_BPC); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_c), kWidth, \
reinterpret_cast<DST_T*>(dst_u_c), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_c), kDstHalfWidth, kWidth, \
@@ -99,9 +108,7 @@ namespace libyuv {
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y_p, kWidth, src_u_p, kSrcHalfWidth, src_v_p, kSrcHalfWidth, \
reinterpret_cast<DST_T*>(dst_y_opt), kWidth, \
reinterpret_cast<DST_T*>(dst_u_opt), kDstHalfWidth, \
reinterpret_cast<DST_T*>(dst_v_opt), kDstHalfWidth, kWidth, \
@@ -127,34 +134,39 @@ namespace libyuv {

#define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, \
SRC_SUBSAMP_Y, FMT_PLANAR, DST_T, DST_BPC, \
DST_SUBSAMP_X, DST_SUBSAMP_Y, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_ - 4, _Any, +, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Unaligned, +, 1, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Invert, -, 0, SRC_DEPTH) \
TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_T, SRC_BPC, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, DST_T, DST_BPC, DST_SUBSAMP_X, DST_SUBSAMP_Y, \
benchmark_width_, _Opt, +, 0, SRC_DEPTH)

TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I420Mirror, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I422, uint8_t, 1, 2, 1, 8)
TESTPLANARTOP(I422, uint8_t, 1, 2, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I444, uint8_t, 1, 1, 1, I444, uint8_t, 1, 1, 1, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(I420, uint8_t, 1, 2, 2, I010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
TESTPLANARTOP(I212, uint16_t, 2, 2, 1, I412, uint16_t, 2, 1, 1, 12)
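[Editor's note: for context, a minimal usage sketch of one of the conversions exercised above. The wrapper name is hypothetical and the strides assumed minimal (width for the Y and destination planes, (width+1)/2 elements for the 4:2:0 source chroma); the I010ToI410 signature is the one declared earlier in this change.]

#include "libyuv/convert.h"

// Upscale I010 chroma (4:2:0, 10-bit in uint16_t) to I410 (4:4:4) using
// the new (bi)linear path; Y is copied through. Hypothetical wrapper.
int ConvertI010ToI410(const uint16_t* src_y, const uint16_t* src_u,
                      const uint16_t* src_v, uint16_t* dst_y,
                      uint16_t* dst_u, uint16_t* dst_v,
                      int width, int height) {
  const int src_halfwidth = (width + 1) >> 1;
  return libyuv::I010ToI410(src_y, width, src_u, src_halfwidth,
                            src_v, src_halfwidth,
                            dst_y, width, dst_u, width, dst_v, width,
                            width, height);  // returns 0 on success
}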

// Test Android 420 to I420
#define TESTAPLANARTOPI(SRC_FMT_PLANAR, PIXEL_STRIDE, SRC_SUBSAMP_X, \