mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
HalfMergeUVPlane function and optimized I444ToNV12 and I444ToNV21
Bug: libyuv:858
Change-Id: Ie1f03a9acaff02ee8059cf1e5c2c2e5afcde8592
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2154608
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
d4c3f45eb6
commit
2f48ffd42b
@ -105,6 +105,19 @@ void MergeUVPlane(const uint8_t* src_u,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Scale U and V to half width and height and merge into interleaved UV plane.
|
||||
// width and height are source size, allowing odd sizes.
|
||||
// Use for converting I444 or I422 to NV12.
|
||||
LIBYUV_API
|
||||
void HalfMergeUVPlane(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int dst_stride_uv,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Swap U and V channels in interleaved UV plane.
|
||||
LIBYUV_API
|
||||
void SwapUVPlane(const uint8_t* src_uv,
|
||||
|
||||
@ -273,6 +273,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOAR30ROW_SSSE3
|
||||
#define HAS_CONVERT16TO8ROW_SSSE3
|
||||
#define HAS_CONVERT8TO16ROW_SSE2
|
||||
#define HAS_HALFMERGEUVROW_SSSE3
|
||||
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
|
||||
#define HAS_I210TOAR30ROW_SSSE3
|
||||
#define HAS_I210TOARGBROW_SSSE3
|
||||
@ -343,7 +344,6 @@ extern "C" {
|
||||
#define HAS_ARGBTOUVJROW_NEON
|
||||
#define HAS_ARGBTOUVROW_NEON
|
||||
#define HAS_ARGBTOYJROW_NEON
|
||||
#define HAS_RGBATOYJROW_NEON
|
||||
#define HAS_ARGBTOYROW_NEON
|
||||
#define HAS_AYUVTOUVROW_NEON
|
||||
#define HAS_AYUVTOVUROW_NEON
|
||||
@ -353,6 +353,7 @@ extern "C" {
|
||||
#define HAS_BYTETOFLOATROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_HALFFLOATROW_NEON
|
||||
#define HAS_HALFMERGEUVROW_NEON
|
||||
#define HAS_I400TOARGBROW_NEON
|
||||
#define HAS_I422ALPHATOARGBROW_NEON
|
||||
#define HAS_I422TOARGB1555ROW_NEON
|
||||
@ -375,19 +376,20 @@ extern "C" {
|
||||
#define HAS_NV21TORGB24ROW_NEON
|
||||
#define HAS_NV21TOYUV24ROW_NEON
|
||||
#define HAS_RAWTOARGBROW_NEON
|
||||
#define HAS_RAWTORGBAROW_NEON
|
||||
#define HAS_RAWTORGB24ROW_NEON
|
||||
#define HAS_RAWTORGBAROW_NEON
|
||||
#define HAS_RAWTOUVROW_NEON
|
||||
#define HAS_RAWTOYROW_NEON
|
||||
#define HAS_RAWTOYJROW_NEON
|
||||
#define HAS_RAWTOYROW_NEON
|
||||
#define HAS_RGB24TOARGBROW_NEON
|
||||
#define HAS_RGB24TOUVROW_NEON
|
||||
#define HAS_RGB24TOYROW_NEON
|
||||
#define HAS_RGB24TOYJROW_NEON
|
||||
#define HAS_RGB24TOYROW_NEON
|
||||
#define HAS_RGB565TOARGBROW_NEON
|
||||
#define HAS_RGB565TOUVROW_NEON
|
||||
#define HAS_RGB565TOYROW_NEON
|
||||
#define HAS_RGBATOUVROW_NEON
|
||||
#define HAS_RGBATOYJROW_NEON
|
||||
#define HAS_RGBATOYROW_NEON
|
||||
#define HAS_SETROW_NEON
|
||||
#define HAS_SPLITRGBROW_NEON
|
||||
@ -1712,6 +1714,27 @@ void MergeUVRow_Any_MMI(const uint8_t* y_buf,
|
||||
uint8_t* dst_ptr,
|
||||
int width);
|
||||
|
||||
void HalfMergeUVRow_C(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
|
||||
void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
|
||||
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width);
|
||||
|
||||
void SplitRGBRow_C(const uint8_t* src_rgb,
|
||||
uint8_t* dst_r,
|
||||
uint8_t* dst_g,
|
||||
|
||||
@ -426,7 +426,41 @@ int I444ToI420(const uint8_t* src_y,
|
||||
dst_v, dst_stride_v, width, height, width, height);
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Implement row conversion.
|
||||
LIBYUV_API
|
||||
int I444ToNV12(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_uv,
|
||||
int dst_stride_uv,
|
||||
int width,
|
||||
int height) {
|
||||
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (height - 1) * src_stride_u;
|
||||
src_v = src_v + (height - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
if (dst_y) {
|
||||
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
}
|
||||
HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
|
||||
dst_stride_uv, width, height);
|
||||
return 0;
|
||||
}
|
||||
|
||||
LIBYUV_API
|
||||
int I444ToNV21(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
@ -440,30 +474,9 @@ int I444ToNV21(const uint8_t* src_y,
|
||||
int dst_stride_vu,
|
||||
int width,
|
||||
int height) {
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
int halfheight = (height + 1) >> 1;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
halfheight = (height + 1) >> 1;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (height - 1) * src_stride_u;
|
||||
src_v = src_v + (height - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
// Allocate u and v buffers
|
||||
align_buffer_64(plane_u, halfwidth * halfheight * 2);
|
||||
uint8_t* plane_v = plane_u + halfwidth * halfheight;
|
||||
|
||||
I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
|
||||
dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
|
||||
height);
|
||||
MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
|
||||
halfwidth, halfheight);
|
||||
free_aligned_buffer_64(plane_u);
|
||||
return 0;
|
||||
return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
|
||||
src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
|
||||
width, height);
|
||||
}
|
||||
|
||||
// I400 is greyscale typically used in MJPG
|
||||
@ -498,46 +511,6 @@ int I400ToI420(const uint8_t* src_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Implement row conversion.
|
||||
LIBYUV_API
|
||||
int I444ToNV12(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_uv,
|
||||
int dst_stride_uv,
|
||||
int width,
|
||||
int height) {
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
int halfheight = (height + 1) >> 1;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
halfheight = (height + 1) >> 1;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (height - 1) * src_stride_u;
|
||||
src_v = src_v + (height - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
// Allocate u and v buffers
|
||||
align_buffer_64(plane_u, halfwidth * halfheight * 2);
|
||||
uint8_t* plane_v = plane_u + halfwidth * halfheight;
|
||||
|
||||
I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
|
||||
dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
|
||||
height);
|
||||
MergeUVPlane(plane_u, halfwidth, plane_v, halfwidth, dst_uv, dst_stride_uv,
|
||||
halfwidth, halfheight);
|
||||
free_aligned_buffer_64(plane_u);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// I400 is greyscale typically used in MJPG
|
||||
LIBYUV_API
|
||||
int I400ToNV21(const uint8_t* src_y,
|
||||
|
||||
@ -488,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// TODO(fbarchard): test negative height for invert.
|
||||
LIBYUV_API
|
||||
int I420ToNV12(const uint8_t* src_y,
|
||||
int src_stride_y,
|
||||
@ -502,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
|
||||
int dst_stride_uv,
|
||||
int width,
|
||||
int height) {
|
||||
int halfwidth = (width + 1) / 2;
|
||||
int halfheight = (height + 1) / 2;
|
||||
if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
|
||||
height == 0) {
|
||||
return -1;
|
||||
}
|
||||
int halfwidth = (width + 1) / 2;
|
||||
int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
halfheight = (height + 1) >> 1;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_u = src_u + (halfheight - 1) * src_stride_u;
|
||||
src_v = src_v + (halfheight - 1) * src_stride_v;
|
||||
src_stride_y = -src_stride_y;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
if (dst_y) {
|
||||
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
|
||||
}
|
||||
|
||||
@ -4103,6 +4103,52 @@ int UYVYToNV12(const uint8_t* src_uyvy,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// width and height are src size allowing odd size handling.
|
||||
LIBYUV_API
|
||||
void HalfMergeUVPlane(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int dst_stride_uv,
|
||||
int width,
|
||||
int height) {
|
||||
int y;
|
||||
void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
|
||||
const uint8_t* src_v, int src_stride_v,
|
||||
uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
|
||||
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_u = src_u + (height - 1) * src_stride_u;
|
||||
src_v = src_v + (height - 1) * src_stride_v;
|
||||
src_stride_u = -src_stride_u;
|
||||
src_stride_v = -src_stride_v;
|
||||
}
|
||||
#if defined(HAS_HALFMERGEUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
|
||||
HalfMergeUVRow = HalfMergeUVRow_NEON;
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_HALFMERGEUVROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
|
||||
HalfMergeUVRow = HalfMergeUVRow_SSSE3;
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height - 1; y += 2) {
|
||||
// Merge a row of U and V into a row of UV.
|
||||
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
|
||||
src_u += src_stride_u * 2;
|
||||
src_v += src_stride_v * 2;
|
||||
dst_uv += dst_stride_uv;
|
||||
}
|
||||
if (height & 1) {
|
||||
HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -3563,6 +3563,30 @@ void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
// Portable row function for HalfMergeUVPlane.
//
// Reads two rows of U and two rows of V (the second row of each plane starts
// src_stride_u / src_stride_v bytes after the first) and writes interleaved
// U,V bytes to dst_uv. Each output U (or V) is the rounded average of a 2x2
// block of source samples: (a + b + c + d + 2) >> 2.
//
// width is the source width. When it is odd, the final column is averaged
// vertically only: (a + c + 1) >> 1.
void HalfMergeUVRow_C(const uint8_t* src_u,
                      int src_stride_u,
                      const uint8_t* src_v,
                      int src_stride_v,
                      uint8_t* dst_uv,
                      int width) {
  // col indexes the source column. Two source columns produce one U,V pair
  // (two bytes), so the destination byte offset equals col as well.
  int col;
  for (col = 0; col < width - 1; col += 2) {
    const int u_total = src_u[col] + src_u[col + 1] +
                        src_u[col + src_stride_u] +
                        src_u[col + src_stride_u + 1];
    const int v_total = src_v[col] + src_v[col + 1] +
                        src_v[col + src_stride_v] +
                        src_v[col + src_stride_v + 1];
    dst_uv[col] = (uint8_t)((u_total + 2) >> 2);
    dst_uv[col + 1] = (uint8_t)((v_total + 2) >> 2);
  }
  if (width & 1) {
    // Leftover single column: average the two rows only.
    dst_uv[col] = (uint8_t)((src_u[col] + src_u[col + src_stride_u] + 1) >> 1);
    dst_uv[col + 1] =
        (uint8_t)((src_v[col] + src_v[col + src_stride_v] + 1) >> 1);
  }
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -1078,6 +1078,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
}
|
||||
#endif
|
||||
|
||||
// clang-format off
|
||||
|
||||
// TODO(mraptis): Consider passing R, G, B multipliers as parameter.
|
||||
// round parameter is register containing value to add before shift.
|
||||
#define RGBTOY(round) \
|
||||
@ -1102,10 +1104,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"phaddw %%xmm0,%%xmm6 \n" \
|
||||
"phaddw %%xmm2,%%xmm1 \n" \
|
||||
"prefetcht0 1280(%0) \n" \
|
||||
"paddw %%" #round \
|
||||
",%%xmm6 \n" \
|
||||
"paddw %%" #round \
|
||||
",%%xmm1 \n" \
|
||||
"paddw %%" #round ",%%xmm6 \n" \
|
||||
"paddw %%" #round ",%%xmm1 \n" \
|
||||
"psrlw $0x8,%%xmm6 \n" \
|
||||
"psrlw $0x8,%%xmm1 \n" \
|
||||
"packuswb %%xmm1,%%xmm6 \n" \
|
||||
@ -1132,10 +1132,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
|
||||
"prefetcht0 1280(%0) \n" \
|
||||
"vpaddw %%" #round \
|
||||
",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
|
||||
"vpaddw %%" #round \
|
||||
",%%ymm2,%%ymm2 \n" \
|
||||
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
|
||||
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
|
||||
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
|
||||
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
|
||||
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
|
||||
@ -1146,6 +1144,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"jg 1b \n" \
|
||||
"vzeroupper \n"
|
||||
|
||||
// clang-format on
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
|
||||
void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
@ -7005,6 +7005,53 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
}
|
||||
#endif // HAS_SWAPUVROW_AVX2
|
||||
|
||||
void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"psrlw $0xf,%%xmm4 \n"
|
||||
"packuswb %%xmm4,%%xmm4 \n"
|
||||
"pxor %%xmm5,%%xmm5 \n"
|
||||
"1: \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n" // load 16 U values
|
||||
"movdqu (%1),%%xmm1 \n" // load 16 V values
|
||||
"movdqu 0(%0,%4,1),%%xmm2 \n" // 16 from next row
|
||||
"movdqu 0(%1,%5,1),%%xmm3 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n" // half size
|
||||
"pmaddubsw %%xmm4,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm3 \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"paddw %%xmm2,%%xmm0 \n"
|
||||
"paddw %%xmm3,%%xmm1 \n"
|
||||
"psrlw $0x1,%%xmm0 \n"
|
||||
"psrlw $0x1,%%xmm1 \n"
|
||||
"pavgw %%xmm5,%%xmm0 \n"
|
||||
"pavgw %%xmm5,%%xmm1 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"packuswb %%xmm1,%%xmm1 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movdqu %%xmm0,(%2) \n" // store 8 UV pixels
|
||||
"lea 0x10(%2),%2 \n"
|
||||
"sub $0x10,%3 \n" // 16 src pixels per loop
|
||||
"jg 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_v), // %1
|
||||
"+r"(dst_uv), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"((intptr_t)(src_stride_u)), // %4
|
||||
"r"((intptr_t)(src_stride_v)) // %5
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||
}
|
||||
|
||||
#endif // defined(__x86_64__) || defined(__i386__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -2984,6 +2984,39 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
: "cc", "memory", "q0", "q1", "q2");
|
||||
}
|
||||
|
||||
// 32-bit ARM NEON row function for HalfMergeUVPlane.
// Per loop iteration it reads 16 U and 16 V bytes from each of two rows (the
// second row starts src_stride_u / src_stride_v bytes after the first), sums
// each 2x2 block, shifts right by 2 with rounding, and stores 8 interleaved
// U,V pairs to dst_uv.
// width is the source width; it is decremented by 16 per iteration and the
// loop repeats while the remainder is greater than zero.
// NOTE(review): the caller visible in this change only selects this function
// when width is a multiple of 16 - confirm before using it with other widths.
void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
// Pointers to the second source row of each plane.
const uint8_t* src_u_1 = src_u + src_stride_u;
|
||||
const uint8_t* src_v_1 = src_v + src_stride_v;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"vld1.8 {q0}, [%0]! \n" // load 16 U values
|
||||
"vld1.8 {q1}, [%2]! \n" // load 16 V values
|
||||
"vld1.8 {q2}, [%1]! \n" // load 16 U values from the second row
|
||||
"vld1.8 {q3}, [%3]! \n" // load 16 V values from the second row
|
||||
"vpaddl.u8 q0, q0 \n" // half size
|
||||
"vpaddl.u8 q1, q1 \n"
|
||||
"vpadal.u8 q0, q2 \n" // add pairwise sums of the second row
|
||||
"vpadal.u8 q1, q3 \n"
|
||||
"vqrshrn.u16 d0, q0, #2 \n" // rounding shift right by 2, narrow to 8 bit
|
||||
"vqrshrn.u16 d1, q1, #2 \n"
|
||||
"subs %5, %5, #16 \n" // 16 src pixels per loop
|
||||
"vst2.8 {d0, d1}, [%4]! \n" // store 8 UV pixels
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_u_1), // %1
|
||||
"+r"(src_v), // %2
|
||||
"+r"(src_v_1), // %3
|
||||
"+r"(dst_uv), // %4
|
||||
"+r"(width) // %5
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -3188,11 +3188,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
|
||||
"uqrshrn v2.8b, v1.8h, #2 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
"+r"(src_ayuv_1), // %1
|
||||
@ -3210,18 +3211,18 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
asm volatile(
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
|
||||
// pixels.
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
|
||||
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
|
||||
"uqrshrn v1.8b, v1.8h, #2 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"subs %w3, %w3, #16 \n" // 16 processed per loop.
|
||||
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_ayuv), // %0
|
||||
"+r"(src_ayuv_1), // %1
|
||||
@ -3265,6 +3266,41 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
: "cc", "memory", "v0", "v1", "v2");
|
||||
}
|
||||
|
||||
// AArch64 NEON row function for HalfMergeUVPlane.
// Per loop iteration it reads 16 U and 16 V bytes from each of two rows (the
// second row starts src_stride_u / src_stride_v bytes after the first), sums
// each 2x2 block, shifts right by 2 with rounding, and stores 8 interleaved
// U,V pairs to dst_uv.
// width is the source width; it is decremented by 16 per iteration and the
// loop repeats while the remainder is greater than zero.
// NOTE(review): the caller visible in this change only selects this function
// when width is a multiple of 16 - confirm before using it with other widths.
void HalfMergeUVRow_NEON(const uint8_t* src_u,
|
||||
int src_stride_u,
|
||||
const uint8_t* src_v,
|
||||
int src_stride_v,
|
||||
uint8_t* dst_uv,
|
||||
int width) {
|
||||
// Pointers to the second source row of each plane.
const uint8_t* src_u_1 = src_u + src_stride_u;
|
||||
const uint8_t* src_v_1 = src_v + src_stride_v;
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v0.16b}, [%0], #16 \n" // load 16 U values
|
||||
"ld1 {v1.16b}, [%2], #16 \n" // load 16 V values
|
||||
"ld1 {v2.16b}, [%1], #16 \n" // load 16 U values from the second row
|
||||
"ld1 {v3.16b}, [%3], #16 \n" // load 16 V values from the second row
|
||||
"uaddlp v0.8h, v0.16b \n" // half size
|
||||
"uaddlp v1.8h, v1.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
|
||||
"uadalp v0.8h, v2.16b \n" // add pairwise sums of the second row
|
||||
"uadalp v1.8h, v3.16b \n"
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"uqrshrn v0.8b, v0.8h, #2 \n" // rounding shift right by 2, narrow to 8 bit
|
||||
"uqrshrn v1.8b, v1.8h, #2 \n"
|
||||
"subs %w5, %w5, #16 \n" // 16 src pixels per loop
|
||||
"st2 {v0.8b, v1.8b}, [%4], #16 \n" // store 8 UV pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_u), // %0
|
||||
"+r"(src_u_1), // %1
|
||||
"+r"(src_v), // %2
|
||||
"+r"(src_v_1), // %3
|
||||
"+r"(dst_uv), // %4
|
||||
"+r"(width) // %5
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -21,6 +21,7 @@
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/planar_functions.h"
|
||||
#include "libyuv/rotate.h"
|
||||
#include "libyuv/scale.h"
|
||||
|
||||
#ifdef ENABLE_ROW_TESTS
|
||||
// row.h defines SIMD_ALIGNED, overriding unit_test.h
|
||||
@ -3479,4 +3480,50 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
|
||||
free_aligned_buffer_page_end(orig_pixels);
|
||||
}
|
||||
|
||||
// Checks HalfMergeUVPlane against a reference produced by scaling U and V
// with ScalePlane (kFilterBilinear) and interleaving them with MergeUVPlane.
TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
  // The destination has half the source dimensions, with odd sizes rounded up.
  int dst_width = (benchmark_width_ + 1) / 2;
  int dst_height = (benchmark_height_ + 1) / 2;
  const int kSrcPlaneSize = benchmark_width_ * benchmark_height_;
  const int kHalfPlaneSize = dst_width * dst_height;
  const int kUVPlaneSize = dst_width * 2 * dst_height;

  align_buffer_page_end(src_pixels_u, kSrcPlaneSize);
  align_buffer_page_end(src_pixels_v, kSrcPlaneSize);
  align_buffer_page_end(tmp_pixels_u, kHalfPlaneSize);
  align_buffer_page_end(tmp_pixels_v, kHalfPlaneSize);
  align_buffer_page_end(dst_pixels_uv_opt, kUVPlaneSize);
  align_buffer_page_end(dst_pixels_uv_c, kUVPlaneSize);

  MemRandomize(src_pixels_u, kSrcPlaneSize);
  MemRandomize(src_pixels_v, kSrcPlaneSize);
  MemRandomize(tmp_pixels_u, kHalfPlaneSize);
  MemRandomize(tmp_pixels_v, kHalfPlaneSize);
  MemRandomize(dst_pixels_uv_opt, kUVPlaneSize);
  MemRandomize(dst_pixels_uv_c, kUVPlaneSize);

  // Reference result: scale each plane separately, then interleave.
  ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
             benchmark_height_, tmp_pixels_u, dst_width, dst_width, dst_height,
             kFilterBilinear);
  ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
             benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
             kFilterBilinear);
  MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
               dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);

  // Function under test, repeated benchmark_iterations_ times.
  for (int iter = 0; iter < benchmark_iterations_; ++iter) {
    HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
                     benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
                     benchmark_width_, benchmark_height_);
  }

  // Both outputs must agree byte for byte.
  for (int idx = 0; idx < kUVPlaneSize; ++idx) {
    EXPECT_EQ(dst_pixels_uv_c[idx], dst_pixels_uv_opt[idx]);
  }

  free_aligned_buffer_page_end(src_pixels_u);
  free_aligned_buffer_page_end(src_pixels_v);
  free_aligned_buffer_page_end(tmp_pixels_u);
  free_aligned_buffer_page_end(tmp_pixels_v);
  free_aligned_buffer_page_end(dst_pixels_uv_opt);
  free_aligned_buffer_page_end(dst_pixels_uv_c);
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user