J420ToI420 using planar 8 bit scaling

- Add Convert8To8Plane, which scales and biases 8 bit values, allowing full
  range YUV to be converted to limited range YUV
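
For reference, a minimal sketch of the per-pixel math this adds, assuming the
220/16 (Y) and 225/16 (U,V) scale/bias pairs that J420ToI420 passes; the
expected values in the comments come from the new TestJ420ToI420 unit test:

#include <stdint.h>
#include <stdio.h>

// Same operation as Convert8To8Row_C: scale is 8.8 fixed point (0..255),
// bias is added after the shift, and nothing is clamped.
static uint8_t scale_bias(uint8_t v, int scale, int bias) {
  return (uint8_t)(((v * scale) >> 8) + bias);
}

int main(void) {
  // Y: full range 0..255 maps into limited range 16..235.
  printf("Y: %d %d %d\n", scale_bias(0, 220, 16), scale_bias(128, 220, 16),
         scale_bias(255, 220, 16));  // 16 126 235
  // U,V: full range 0..255 maps into limited range 16..240.
  printf("UV: %d %d %d\n", scale_bias(0, 225, 16), scale_bias(128, 225, 16),
         scale_bias(255, 225, 16));  // 16 128 240
  return 0;
}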

libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Samsung S23
J420ToI420_Opt (45 ms)
I420ToI420_Opt (37 ms)

Skylake
J420ToI420_Opt (596 ms)
I420ToI420_Opt (99 ms)

Bug: 381327032
Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2025-01-21 15:56:56 -08:00
parent ef52c1658a
commit 26277baf96
15 changed files with 428 additions and 96 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1899
Version: 1900
License: BSD
License File: LICENSE
Shipped: yes


@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
// Convert J420 to I420.
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to NV21.
LIBYUV_API
int I400ToNV21(const uint8_t* src_y,


@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int width,
int height);
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8_t* dst_y,


@ -507,6 +507,7 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON
@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y,
int scale,
int width);
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int bias,
int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1899
#define LIBYUV_VERSION 1900
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y,
1, 1, 16);
}
static int Planar8bitTo8bit(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y,
int scale_y,
int bias_y,
int scale_uv,
int bias_uv) {
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
uv_height = -uv_height;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (uv_height - 1) * src_stride_u;
src_v = src_v + (uv_height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
if (dst_y) {
Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y,
width, height);
}
// Convert UV planes.
Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv,
uv_width, uv_height);
Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv,
uv_width, uv_height);
return 0;
}
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 220, 16, 225, 16);
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
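
A minimal caller sketch for the new entry point; the frame size, packed
strides, and the helper name convert_frame below are illustrative, not part
of this change:

#include <stdint.h>
#include "libyuv/convert.h"

// Hypothetical helper: convert a tightly packed full-range (JPEG) 1280x720
// J420 frame to limited-range I420. Strides equal plane widths when packed.
int convert_frame(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                  uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
  const int width = 1280;
  const int height = 720;
  return libyuv::J420ToI420(y, width, u, width / 2, v, width / 2,
                            dst_y, width, dst_u, width / 2, dst_v, width / 2,
                            width, height);
}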


@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0;
cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
}
}
}


@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y,
}
}
// Convert a plane of 8 bit data to 8 bit, applying a scale and bias.
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height) {
int y;
void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale,
int bias, int width) = Convert8To8Row_C;
if (width <= 0 || height == 0) {
return;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_CONVERT8TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert8To8Row = Convert8To8Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_NEON;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert8To8Row = Convert8To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
Convert8To8Row = Convert8To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_AVX2;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert8To8Row = Convert8To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert8To8Row = Convert8To8Row_AVX512BW;
}
}
#endif
// Convert plane
for (y = 0; y < height; ++y) {
Convert8To8Row(src_y, dst_y, scale, bias, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Copy I422.
LIBYUV_API
int I422Copy(const uint8_t* src_y,


@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
#endif
#undef ANY11C
// Any 1 to 1 with scale and bias parameters. SBPP and BPP measure in bytes.
#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \
int width) { \
SIMD_ALIGNED(STYPE vin[64]); \
SIMD_ALIGNED(DTYPE vout[64]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \
} \
memcpy(vin, src_ptr + n, r * SBPP); \
ANY_SIMD(vin, vout, scale, bias, MASK + 1); \
memcpy(dst_ptr + n, vout, r * BPP); \
}
#ifdef HAS_CONVERT8TO8ROW_NEON
ANY11SB(Convert8To8Row_Any_NEON,
Convert8To8Row_NEON,
1,
1,
uint8_t,
uint8_t,
31)
#endif
#undef ANY11SB
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \


@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y,
}
}
// Use scale and bias to convert J420 to I420.
// The scale parameter is 8.8 fixed point but limited to 0 to 255.
// Based on DivideRow, but adds a bias.
// Does not clamp.
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
int x;
assert(scale >= 0);
assert(scale <= 255);
for (x = 0; x < width; ++x) {
dst_y[x] = ((src_y[x] * scale) >> 8) + bias;
}
}
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
memcpy(dst, src, count);
}


@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I410ToAR30Row_NEON(const uint16_t* src_y,
@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I212ToAR30Row_NEON(const uint16_t* src_y,
@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV212 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y,
@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV210 NVTORGB RGBTORGB8
"1: \n" READYUV210 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV410 NVTORGB RGBTORGB8
"1: \n" READYUV410 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV212 NVTORGB RGBTORGB8
"1: \n" READYUV212 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
asm volatile(YUVTORGB_SETUP
asm volatile(
YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void P410ToAR30Row_NEON(const uint16_t* src_y,
@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
asm volatile(YUVTORGB_SETUP
asm volatile(
YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I422ToAR30Row_NEON(const uint8_t* src_y,
@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
// RGB1555.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
"movi v7.16b, #16 \n" // Add 16 constant
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
RGB555TOARGB
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
"movi v27.16b, #16 \n" // Add 16 constant
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
ARGB4444TORGB
"umull v16.8h, v0.8b, v24.8b \n" // B
@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint32_t value) {
asm volatile(
"dup v0.4s, %w3 \n" // duplicate scale value.
"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb.
"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b
// aarrggbbaarrggbb.
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"mul v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"dup v4.8h, %w3 \n"
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"umull v0.4s, v2.4h, v4.4h \n"
"umull2 v1.4s, v2.8h, v4.8h \n"
"umull v2.4s, v3.4h, v4.4h \n"
@ -5306,7 +5313,6 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"uqshl v0.8h, v0.8h, v2.8h \n"
"uqshl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"str q0, [%1], #16 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5346,6 +5352,40 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
: "cc", "memory", "v0", "v1", "v2");
}
// Use scale and bias to convert J420 to I420.
// The scale parameter is 8.8 fixed point but limited to 0 to 255.
// Based on DivideRow, but adds a bias.
// Does not clamp.
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
asm volatile(
"dup v4.16b, %w3 \n" // scale
"dup v5.16b, %w4 \n" // bias
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 pixels per loop
"umull v0.8h, v2.8b, v4.8b \n"
"umull2 v1.8h, v2.16b, v4.16b \n"
"umull v2.8h, v3.8b, v4.8b \n"
"umull2 v3.8h, v3.16b, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"uzp2 v1.16b, v2.16b, v3.16b \n"
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
"add v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale), // %3
"r"(bias) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}


@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per
// iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"


@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
}
#endif
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
0, 0, 128, 128, 255, 255};
const uint8_t src_u[3] = {0, 128, 255};
const uint8_t src_v[3] = {0, 128, 255};
uint8_t dst_y[12];
uint8_t dst_u[3];
uint8_t dst_v[3];
ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v,
3, 6, 2),
0);
EXPECT_EQ(dst_y[0], 16);
EXPECT_EQ(dst_y[2], 126);
EXPECT_EQ(dst_y[4], 235);
EXPECT_EQ(dst_u[0], 16);
EXPECT_EQ(dst_u[1], 128);
EXPECT_EQ(dst_u[2], 240);
EXPECT_EQ(dst_v[0], 16);
EXPECT_EQ(dst_v[1], 128);
EXPECT_EQ(dst_v[2], 240);
}
#endif // !defined(LEAN_TESTS)
} // namespace libyuv


@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, Convert8To8Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
MemRandomize(src_pixels_y, kPixels);
memset(dst_pixels_y_opt, 0, kPixels);
memset(dst_pixels_y_c, 1, kPixels);
MaskCpuFlags(disable_cpu_flags_);
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
}
free_aligned_buffer_page_end(src_pixels_y);
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, YUY2ToY) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels * 2);