J420ToI420 using planar 8 bit scaling

- Add Convert8To8Plane, which scales and adds a bias to 8 bit values, allowing
  full range YUV to be converted to limited range YUV
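
For reference, the per-pixel mapping Convert8To8Plane applies can be sketched
in plain C (an illustrative sketch built from the constants in this change,
not code from the library; FullToLimited is a made-up name). The scale is 8.8
fixed point, consistent with 220 ~= 219/255 * 256 for Y and 225 ~= 224/255 * 256
for U/V, with a bias of 16:

  #include <stdint.h>

  // Full range (JPEG) value to limited range (BT.601) value. No clamp is
  // needed for these scale/bias pairs: results top out at 235 (Y) / 240 (U/V).
  static inline uint8_t FullToLimited(uint8_t v, int scale, int bias) {
    return (uint8_t)(((v * scale) >> 8) + bias);
  }
  // FullToLimited(255, 220, 16) == 235, FullToLimited(255, 225, 16) == 240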

libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Samsung S23
J420ToI420_Opt (45 ms)
I420ToI420_Opt (37 ms)

Skylake
J420ToI420_Opt (596 ms)
I420ToI420_Opt (99 ms)

Bug: 381327032
Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2025-01-21 15:56:56 -08:00
parent ef52c1658a
commit 26277baf96
15 changed files with 428 additions and 96 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1899
+Version: 1900
License: BSD
License File: LICENSE
Shipped: yes


@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
// Convert J420 to I420.
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to NV21.
LIBYUV_API
int I400ToNV21(const uint8_t* src_y,


@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int width,
int height);
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8_t* dst_y,


@ -507,6 +507,7 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON
@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y,
int scale,
int width);
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int bias,
int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1899
+#define LIBYUV_VERSION 1900
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y,
1, 1, 16);
}
static int Planar8bitTo8bit(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y,
int scale_y,
int bias_y,
int scale_uv,
int bias_uv) {
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
uv_height = -uv_height;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (uv_height - 1) * src_stride_u;
src_v = src_v + (uv_height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
if (dst_y) {
Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y,
width, height);
}
// Convert UV planes.
Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv,
uv_width, uv_height);
Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv,
uv_width, uv_height);
return 0;
}
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 220, 16, 225, 16);
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
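
A minimal usage sketch of the new entry point (an editorial illustration; the
wrapper name and the fixed 1280x720 size are not part of the change):

  #include <stdint.h>
  #include "libyuv/convert.h"

  // Convert a full range (JPEG/J420) 1280x720 frame to limited range I420.
  // Plane strides equal to the plane widths are assumed for simplicity.
  int FullRangeToLimitedRange(const uint8_t* src_y, const uint8_t* src_u,
                              const uint8_t* src_v, uint8_t* dst_y,
                              uint8_t* dst_u, uint8_t* dst_v) {
    const int width = 1280;
    const int height = 720;
    return libyuv::J420ToI420(src_y, width, src_u, width / 2, src_v, width / 2,
                              dst_y, width, dst_u, width / 2, dst_v, width / 2,
                              width, height);
  }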


@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
-  cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0;
+  cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
}
}
}


@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y,
}
}
// Convert a plane of 8 bit data to 8 bit
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height) {
int y;
void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale,
int bias, int width) = Convert8To8Row_C;
if (width <= 0 || height == 0) {
return;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_CONVERT8TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert8To8Row = Convert8To8Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_NEON;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert8To8Row = Convert8To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
Convert8To8Row = Convert8To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_AVX2;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert8To8Row = Convert8To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert8To8Row = Convert8To8Row_AVX512BW;
}
}
#endif
// Convert plane
for (y = 0; y < height; ++y) {
Convert8To8Row(src_y, dst_y, scale, bias, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
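
The "Coalesce rows" step above is worth a quick worked example (an editorial
note, not part of the change): when both strides equal the width, the plane is
contiguous, so the whole image is handed to the row function as one long row.

  // e.g. a contiguous 1280x720 plane with stride 1280:
  //   width becomes 1280 * 720 = 921600, height becomes 1,
  //   and Convert8To8Row runs exactly once over the plane.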
// Copy I422.
LIBYUV_API
int I422Copy(const uint8_t* src_y,


@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
#endif
#undef ANY11C
// Any 1 to 1 with scale and bias parameters. SBPP and BPP measure bytes per pixel.
#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \
int width) { \
SIMD_ALIGNED(STYPE vin[64]); \
SIMD_ALIGNED(DTYPE vout[64]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \
} \
memcpy(vin, src_ptr + n, r * SBPP); \
ANY_SIMD(vin, vout, scale, bias, MASK + 1); \
memcpy(dst_ptr + n, vout, r * BPP); \
}
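
How the Any wrapper handles widths that are not a multiple of the vector size,
worked through for the NEON instantiation below (MASK is 31, so the kernel
consumes 32 pixels per block); this is an editorial illustration, not code
from the change:

  // width = 70: n = 70 & ~31 = 64 pixels go straight to Convert8To8Row_NEON;
  //             r = 70 & 31  = 6 tail pixels are copied into the 64-byte temp
  //             buffers, processed as one full 32-pixel block, and the 6
  //             valid output bytes are copied back to dst_ptr + 64.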
#ifdef HAS_CONVERT8TO8ROW_NEON
ANY11SB(Convert8To8Row_Any_NEON,
Convert8To8Row_NEON,
1,
1,
uint8_t,
uint8_t,
31)
#endif
#undef ANY11SB
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \


@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y,
}
}
// Use scale to convert J420 to I420
// scale parameter is 8.8 fixed point but limited to 0 to 255
// Function is based on DivideRow, but adds a bias
// Does not clamp
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
int x;
assert(scale >= 0);
assert(scale <= 255);
for (x = 0; x < width; ++x) {
dst_y[x] = ((src_y[x] * scale) >> 8) + bias;
}
}
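
The "Does not clamp" note above is safe for the J420ToI420 parameters because
the worst-case results already fit in 8 bits (editorial arithmetic check):

  ((255 * 220) >> 8) + 16 = 219 + 16 = 235   // largest possible Y output
  ((255 * 225) >> 8) + 16 = 224 + 16 = 240   // largest possible U/V output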
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
memcpy(dst, src, count);
}


@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I410ToAR30Row_NEON(const uint16_t* src_y,
@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I212ToAR30Row_NEON(const uint16_t* src_y,
@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV212 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y,
@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV210 NVTORGB RGBTORGB8
+"1: \n" READYUV210 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV410 NVTORGB RGBTORGB8
+"1: \n" READYUV410 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV212 NVTORGB RGBTORGB8
+"1: \n" READYUV212 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
-asm volatile(YUVTORGB_SETUP
+asm volatile(
+YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void P410ToAR30Row_NEON(const uint16_t* src_y,
@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
-asm volatile(YUVTORGB_SETUP
+asm volatile(
+YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I422ToAR30Row_NEON(const uint8_t* src_y,
@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
-"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
+"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
+// RGB1555.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
"movi v7.16b, #16 \n" // Add 16 constant
"1: \n"
-"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
+"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555
+// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
RGB555TOARGB
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
"movi v27.16b, #16 \n" // Add 16 constant
"1: \n"
-"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
+"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444
+// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
ARGB4444TORGB
"umull v16.8h, v0.8b, v24.8b \n" // B
@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint32_t value) {
asm volatile(
"dup v0.4s, %w3 \n" // duplicate scale value.
-"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb.
+"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b
+// aarrggbbaarrggbb.
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"mul v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"dup v4.8h, %w3 \n"
"1: \n"
"ldp q2, q3, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"umull v0.4s, v2.4h, v4.4h \n"
"umull2 v1.4s, v2.8h, v4.8h \n"
"umull v2.4s, v3.4h, v4.4h \n"
@ -5306,7 +5313,6 @@
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"uqshl v0.8h, v0.8h, v2.8h \n"
"uqshl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"str q0, [%1], #16 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5346,6 +5352,40 @@
: "cc", "memory", "v0", "v1", "v2");
}
// Use scale to convert J420 to I420
// scale parameter is 8.8 fixed point but limited to 0 to 255
// Function is based on DivideRow, but adds a bias
// Does not clamp
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
asm volatile(
"dup v4.16b, %w3 \n" // scale
"dup v5.16b, %w4 \n" // bias
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 pixels per loop
"umull v0.8h, v2.8b, v4.8b \n"
"umull2 v1.8h, v2.16b, v4.16b \n"
"umull v2.8h, v3.8b, v4.8b \n"
"umull2 v3.8h, v3.16b, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"uzp2 v1.16b, v2.16b, v3.16b \n"
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
"add v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale), // %3
"r"(bias) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
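
A scalar model of what the widening multiply plus uzp2 sequence computes
(an editorial sketch; the real kernel above handles 32 pixels per iteration):

  #include <stdint.h>

  static inline uint8_t Convert8To8_ScalarModel(uint8_t v, uint8_t scale,
                                                uint8_t bias) {
    uint16_t wide = (uint16_t)v * scale;  // umull/umull2: 8x8 -> 16 bit product
    uint8_t high = (uint8_t)(wide >> 8);  // uzp2 keeps the high byte per lane
    return (uint8_t)(high + bias);        // add v5.16b: plain add, no saturation
  }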
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}


@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
-"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
+"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per
+// iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"


@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
}
#endif
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
0, 0, 128, 128, 255, 255};
const uint8_t src_u[3] = {0, 128, 255};
const uint8_t src_v[3] = {0, 128, 255};
uint8_t dst_y[12];
uint8_t dst_u[3];
uint8_t dst_v[3];
ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v,
3, 6, 2),
0);
EXPECT_EQ(dst_y[0], 16);
EXPECT_EQ(dst_y[2], 126);
EXPECT_EQ(dst_y[4], 235);
EXPECT_EQ(dst_u[0], 16);
EXPECT_EQ(dst_u[1], 128);
EXPECT_EQ(dst_u[2], 240);
EXPECT_EQ(dst_v[0], 16);
EXPECT_EQ(dst_v[1], 128);
EXPECT_EQ(dst_v[2], 240);
}
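
The expected values follow directly from the row math (worked out here for
reference):

  Y:   ((0 * 220) >> 8) + 16 = 16,  ((128 * 220) >> 8) + 16 = 126,  ((255 * 220) >> 8) + 16 = 235
  U/V: ((0 * 225) >> 8) + 16 = 16,  ((128 * 225) >> 8) + 16 = 128,  ((255 * 225) >> 8) + 16 = 240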
#endif // !defined(LEAN_TESTS)
} // namespace libyuv


@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, Convert8To8Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
MemRandomize(src_pixels_y, kPixels);
memset(dst_pixels_y_opt, 0, kPixels);
memset(dst_pixels_y_c, 1, kPixels);
MaskCpuFlags(disable_cpu_flags_);
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
}
free_aligned_buffer_page_end(src_pixels_y);
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, YUY2ToY) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels * 2);