YUY2ToNV12 using YUY2ToY and YUY2ToNVUV

- Optimized YUY2ToNV12 that reduces it from 3 steps to 2 steps
  - Was SplitUV, memcpy Y, InterpolateUV
  - Now YUY2ToY, YUY2ToNVUV
- rollback LIBYUV_UNLIMITED_DATA

3840x2160 1000 iterations:

Pixel 2 Cortex A73
Was YUY2ToNV12_Opt (6515 ms)
Now YUY2ToNV12_Opt (3350 ms)

AB7 Mediatek P35 Cortex A53
Was YUY2ToNV12_Opt (6435 ms)
Now YUY2ToNV12_Opt (3301 ms)

Skylake AVX2 x64
Was YUY2ToNV12_Opt (1872 ms)
Now YUY2ToNV12_Opt (1657 ms)

SSE2 x64
Was YUY2ToNV12_Opt (2008 ms)
Now YUY2ToNV12_Opt (1691 ms)

Windows Skylake AVX2 32 bit x86
Was YUY2ToNV12_Opt (2161 ms)
Now YUY2ToNV12_Opt (1628 ms)

Bug: libyuv:943
Change-Id: I6c2ba2ae765413426baf770b837de114f808f6d0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3929843
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-09-30 15:12:37 -07:00 committed by libyuv LUCI CQ
parent 9ba40a8f03
commit 00950840d1
9 changed files with 253 additions and 104 deletions

View File

@ -236,8 +236,8 @@ extern "C" {
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOYROW_AVX2
// Effects:
@ -326,6 +326,7 @@ extern "C" {
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#define HAS_YUY2TONVUVROW_SSE2
#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
@ -391,6 +392,7 @@ extern "C" {
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
#define HAS_YUY2TONVUVROW_AVX2
#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1
@ -524,8 +526,9 @@ extern "C" {
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TONVUVROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOYROW_NEON
// Effects:
@ -4724,6 +4727,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4734,6 +4741,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4744,6 +4755,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4774,6 +4789,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
int src_stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4784,6 +4803,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4794,6 +4817,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
@ -4804,6 +4831,10 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_uv,
int width);
void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,

View File

@ -5095,9 +5095,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
return 0;
}
// TODO(fbarchard): Consider if width is even Y channel can be split
// directly. A SplitUVRow_Odd function could copy the remaining chroma.
LIBYUV_API
int YUY2ToNV12(const uint8_t* src_yuy2,
int src_stride_yuy2,
@ -5108,13 +5105,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
int width,
int height) {
int y;
int halfwidth = (width + 1) >> 1;
void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
int width) = SplitUVRow_C;
void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
ptrdiff_t src_stride, int dst_width,
int source_y_fraction) = InterpolateRow_C;
void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
YUY2ToYRow_C;
void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2,
uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C;
if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
return -1;
}
@ -5125,109 +5119,83 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
src_stride_yuy2 = -src_stride_yuy2;
}
#if defined(HAS_SPLITUVROW_SSE2)
#if defined(HAS_YUY2TOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
SplitUVRow = SplitUVRow_Any_SSE2;
YUY2ToYRow = YUY2ToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_SSE2;
YUY2ToYRow = YUY2ToYRow_SSE2;
}
}
#endif
#if defined(HAS_SPLITUVROW_AVX2)
#if defined(HAS_YUY2TOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_Any_AVX2;
YUY2ToYRow = YUY2ToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_AVX2;
YUY2ToYRow = YUY2ToYRow_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_NEON)
#if defined(HAS_YUY2TOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_Any_NEON;
YUY2ToYRow = YUY2ToYRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_NEON;
YUY2ToYRow = YUY2ToYRow_NEON;
}
}
#endif
#if defined(HAS_SPLITUVROW_MSA)
#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
SplitUVRow = SplitUVRow_Any_MSA;
YUY2ToYRow = YUY2ToYRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_MSA;
YUY2ToYRow = YUY2ToYRow_MSA;
}
}
#endif
#if defined(HAS_SPLITUVROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
SplitUVRow = SplitUVRow_Any_LSX;
#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
YUY2ToYRow = YUY2ToYRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
SplitUVRow = SplitUVRow_LSX;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
InterpolateRow = InterpolateRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_SSSE3;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
InterpolateRow = InterpolateRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_AVX2;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
InterpolateRow = InterpolateRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
InterpolateRow = InterpolateRow_NEON;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
InterpolateRow = InterpolateRow_Any_MSA;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_MSA;
}
}
#endif
#if defined(HAS_INTERPOLATEROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
InterpolateRow = InterpolateRow_Any_LSX;
if (IS_ALIGNED(width, 32)) {
InterpolateRow = InterpolateRow_LSX;
YUY2ToYRow = YUY2ToYRow_LASX;
}
}
#endif
{
int awidth = halfwidth * 2;
// row of y and 2 rows of uv
align_buffer_64(rows, awidth * 3);
#if defined(HAS_YUY2TONVUVROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2;
}
}
#endif
#if defined(HAS_YUY2TONVUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2;
}
}
#endif
#if defined(HAS_YUY2TONVUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
YUY2ToNVUVRow = YUY2ToNVUVRow_NEON;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
// Split Y from UV.
SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
memcpy(dst_y, rows, width);
SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
memcpy(dst_y + dst_stride_y, rows, width);
InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
// Split Y from UV.
SplitUVRow(src_yuy2, rows, dst_uv, awidth);
memcpy(dst_y, rows, width);
}
free_aligned_buffer_64(rows);
for (y = 0; y < height - 1; y += 2) {
YUY2ToYRow(src_yuy2, dst_y, width);
YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width);
src_yuy2 += src_stride_yuy2 * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
YUY2ToYRow(src_yuy2, dst_y, width);
YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width);
}
return 0;
}

View File

@ -673,6 +673,35 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15)
#endif
#undef ANY21
// Any 2 planes to 1 with stride
// width is measured in source pixels. 4 bytes contains 2 pixels
// Wraps a SIMD row function that requires width to be a multiple of
// 2 * (MASK + 1) pixels: the aligned part is run directly, and the
// remainder is run on zero-padded scratch rows.
#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                           \
  void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv,    \
               int width) {                                                  \
    /* Two 64-byte input rows plus the SIMD output row. 64-byte spacing */   \
    /* holds the largest remainder (MASK=15 -> 15 groups * 4 bytes = 60) */  \
    /* without the rows overlapping the 64 bytes one SIMD pass reads. */     \
    SIMD_ALIGNED(uint8_t temp[64 * 3]);                                      \
    memset(temp, 0, 64 * 2); /* zero both input rows, for msan */            \
    int awidth = (width + 1) / 2; /* width in 2-pixel (4-byte) groups */     \
    int r = awidth & MASK;        /* remainder groups for scratch pass */    \
    int n = awidth & ~MASK;       /* groups handled directly by SIMD */      \
    if (n > 0) {                                                             \
      ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2);                        \
    }                                                                        \
    memcpy(temp, src_yuy2 + n * SBPP, r * SBPP);                             \
    memcpy(temp + 64, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP);          \
    /* Width MASK + 1 makes the SIMD loop run exactly one iteration, */      \
    /* consuming 2 * (MASK + 1) pixels - the maximum remainder. */           \
    ANY_SIMD(temp, 64, temp + 128, MASK + 1);                                \
    memcpy(dst_uv + n * BPP, temp + 128, r * BPP);                           \
  }

#ifdef HAS_YUY2TONVUVROW_NEON
ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7)
#endif
#ifdef HAS_YUY2TONVUVROW_SSE2
ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7)
#endif
#ifdef HAS_YUY2TONVUVROW_AVX2
ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15)
#endif
// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \

View File

@ -22,9 +22,13 @@ extern "C" {
#endif
// This macro controls YUV to RGB using unsigned math to extend range of
// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B.
// Enable this macro for backwards compatibility with limited range 0 to 2.
// LIBYUV_LIMITED_DATA
// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
// LIBYUV_UNLIMITED_DATA
// Macros to enable unlimited data for each colorspace
// LIBYUV_UNLIMITED_BT601
// LIBYUV_UNLIMITED_BT709
// LIBYUV_UNLIMITED_BT2020
// The following macro from row_win makes the C code match the row_win code,
// which is 7 bit fixed point for ARGBToI420:
@ -1480,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* max(128, round(2.018 * 64)) */
@ -1534,7 +1538,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* max(128, round(2.112 * 64)) */
@ -1588,7 +1592,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
// KR = 0.2627; KB = 0.0593
// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
#define UB 137 /* round(2.142 * 64) */
#else
#define UB 128 /* max(128, round(2.142 * 64)) */
@ -3094,6 +3098,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
}
}
// Filter 2 rows of YUY2 UV's (422) into UV (NV12).
// Vertically averages (with rounding) the interleaved U/V bytes of two
// YUY2 rows, producing one row of NV12 chroma. Each 2-pixel group is
// 4 source bytes (Y0 U Y1 V) and yields 2 destination bytes (U V).
void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_uv,
                     int width) {
  const uint8_t* row_below = src_yuy2 + src_stride_yuy2;
  int x;
  for (x = 0; x < width; x += 2) {
    int byte_off = x * 2; /* byte offset of this 2-pixel group */
    dst_uv[x] = (uint8_t)((src_yuy2[byte_off + 1] +
                           row_below[byte_off + 1] + 1) >> 1);
    dst_uv[x + 1] = (uint8_t)((src_yuy2[byte_off + 3] +
                               row_below[byte_off + 3] + 1) >> 1);
  }
}
// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
uint8_t* dst_u,

View File

@ -6739,6 +6739,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
// Filter 2 rows of YUY2 UV values (4:2:2) into one row of interleaved
// UV (NV12 chroma). Per iteration: loads 32 bytes (16 pixels) from each
// of the two rows, averages them with rounding (pavgb), then keeps the
// odd bytes (U/V) of each 16-bit lane via psrlw + packuswb and stores
// 16 UV bytes.
// NOTE(review): the counter is decremented by 16 pixels per pass, so
// width is presumably a positive multiple of 16; remainders appear to be
// handled by YUY2ToNVUVRow_Any_SSE2 - confirm against callers.
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"           // row 0, pixels 0-7
      "movdqu 0x10(%0),%%xmm1 \n"       // row 0, pixels 8-15
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"  // row 1 (src + stride)
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"
      "lea 0x20(%0),%0 \n"
      "pavgb %%xmm2,%%xmm0 \n"          // rounded vertical average
      "pavgb %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"            // keep odd bytes = U/V
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"
      "movdqu %%xmm0,(%1) \n"           // store 16 UV bytes
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"                 // 16 pixels consumed
      "jg 1b \n"
      : "+r"(src_yuy2), // %0
        "+r"(dst_uv),   // %1
        "+r"(width)     // %2
      : "r"((intptr_t)(stride_yuy2)) // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}
void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,
@ -6939,6 +6966,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
: "memory", "cc", "xmm0", "xmm1", "xmm5");
}
// Filter 2 rows of YUY2 UV values (4:2:2) into one row of interleaved
// UV (NV12 chroma), 32 pixels (64 bytes per row) per iteration. Same
// algorithm as the SSE2 version: rounded vertical average (vpavgb) of
// the two rows, then keep the odd (U/V) bytes via vpsrlw + vpackuswb;
// vpermq repairs the 128-bit lane interleave that vpackuswb introduces.
// NOTE(review): decrements by 32 pixels per pass, so width is presumably
// a positive multiple of 32; YUY2ToNVUVRow_Any_AVX2 appears to cover the
// remainder - confirm against callers.
void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"                 // row 0, bytes 0-31
      "vmovdqu 0x20(%0),%%ymm1 \n"             // row 0, bytes 32-63
      "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"  // rounded average with row 1
      "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"           // keep odd bytes = U/V
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"          // fix 128-bit lane order
      "vmovdqu %%ymm0,(%1) \n"                 // store 32 UV bytes
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"                        // 32 pixels consumed
      "jg 1b \n"
      "vzeroupper \n"                          // avoid AVX-SSE transition stall
      : "+r"(src_yuy2), // %0
        "+r"(dst_uv),   // %1
        "+r"(width)     // %2
      : "r"((intptr_t)(stride_yuy2)) // %3
      : "memory", "cc", "xmm0", "xmm1");
}
void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
int stride_yuy2,
uint8_t* dst_u,

View File

@ -1583,6 +1583,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
);
}
// Filter 2 rows of YUY2 UV values (4:2:2) into one row of interleaved
// UV (NV12 chroma), 16 pixels (8 UV pairs) per iteration. vld2.8
// deinterleaves each row into Y (q0/q2) and UV (q1/q3); vrhadd.u8 is the
// rounded vertical average of the UV halves.
// NOTE(review): decrements by 16 pixels per pass, so width is presumably
// a positive multiple of 16 (YUY2ToNVUVRow_Any_NEON covers remainders).
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      "add %1, %0, %1 \n" // %1 = second row = src_yuy2 + stride
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
      "vld2.8 {q2, q3}, [%1]! \n" // load next row YUY2.
      "vrhadd.u8 q4, q1, q3 \n" // average rows of UV
      "vst1.8 {q4}, [%2]! \n" // store 8 UV.
      "bgt 1b \n"
      : "+r"(src_yuy2), // %0
        "+r"(stride_yuy2), // %1
        "+r"(dst_uv), // %2
        "+r"(width) // %3
      :
      // q4 (= d8/d9) is written; d8-d15 are callee-saved under AAPCS, so
      // d8/d9 must be declared clobbered or the caller's VFP registers
      // are silently corrupted.
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
        "d8", "d9" // Clobber List
  );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,

View File

@ -1808,6 +1808,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
);
}
// Filter 2 rows of YUY2 UV values (4:2:2) into one row of interleaved
// UV (NV12 chroma), 16 pixels (8 UV pairs) per iteration (aarch64).
// ld2 deinterleaves each row into Y (v0/v2) and UV (v1/v3); urhadd is
// the rounded vertical average of the UV halves. prfm prefetches the
// next source data to hide load latency.
// NOTE(review): decrements by 16 pixels per pass, so width is presumably
// a positive multiple of 16 (YUY2ToNVUVRow_Any_NEON covers remainders).
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  // Second input row; advanced in lockstep with src_yuy2 by the asm.
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels
      "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
      "ld2 {v2.16b,v3.16b}, [%1], #32 \n" // load next row
      "urhadd v4.16b, v1.16b, v3.16b \n" // average rows of UV
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v4.16b}, [%2], #16 \n" // store 8 UV.
      "b.gt 1b \n"
      : "+r"(src_yuy2), // %0
        "+r"(src_yuy2b), // %1
        "+r"(dst_uv), // %2
        "+r"(width) // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
  );
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,

View File

@ -32,10 +32,10 @@ namespace libyuv {
#endif
#define ERROR_R 1
#define ERROR_G 1
#if defined(LIBYUV_LIMITED_DATA)
#define ERROR_B 18
#else
#ifdef LIBYUV_UNLIMITED_DATA
#define ERROR_B 1
#else
#define ERROR_B 18
#endif
#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
@ -502,10 +502,10 @@ TEST_F(LibYUVColorTest, TestYUV) {
YUVToRGB(240, 0, 0, &r1, &g1, &b1);
EXPECT_EQ(57, r1);
EXPECT_EQ(255, g1);
#if defined(LIBYUV_LIMITED_DATA)
EXPECT_EQ(5, b1);
#else
#ifdef LIBYUV_UNLIMITED_DATA
EXPECT_EQ(3, b1);
#else
EXPECT_EQ(5, b1);
#endif
for (int i = 0; i < 256; ++i) {

View File

@ -2620,10 +2620,10 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) {
// Test result matches known hash value.
uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
#if defined(LIBYUV_LIMITED_DATA)
EXPECT_EQ(dst_argb_hash, 2355976473u);
#else
#ifdef LIBYUV_UNLIMITED_DATA
EXPECT_EQ(dst_argb_hash, 3900633302u);
#else
EXPECT_EQ(dst_argb_hash, 2355976473u);
#endif
free_aligned_buffer_page_end(dst_argb);