Add AYUVToNV12 and NV21ToNV12

BUG=libyuv:832
TESTED=out/Release/libyuv_unittest --gtest_filter=*ToNV12* --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

R=rrwinterton@gmail.com

Change-Id: Id03b4613211fb6a6e163d10daa7c692fe31e36d8
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1560080
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Frank Barchard 2019-04-12 10:20:44 -07:00 committed by Commit Bot
parent 4bd08cbc0e
commit 413a8d8041
20 changed files with 710 additions and 281 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1727
License: BSD
License File: LICENSE

View File

@ -226,6 +226,17 @@ int UYVYToI420(const uint8_t* src_uyvy,
int width,
int height);
// Convert AYUV to NV12.
LIBYUV_API
int AYUVToNV12(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert AYUV to NV21.
LIBYUV_API
int AYUVToNV21(const uint8_t* src_ayuv,

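For orientation, a minimal caller sketch for the new entry point (buffer names and sizing are illustrative, not part of the change; AYUV is 4 bytes per pixel, and NV12 needs a full-size Y plane plus a half-height interleaved UV plane):

#include "libyuv/convert.h"

int ConvertAYUVFrame(const uint8_t* ayuv, int width, int height,
                     uint8_t* dst_y, uint8_t* dst_uv) {
  // UV plane rows hold one U and one V byte per 2x2 pixel block.
  int uv_stride = ((width + 1) / 2) * 2;
  return libyuv::AYUVToNV12(ayuv, width * 4, dst_y, width,
                            dst_uv, uv_stride, width, height);
}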
View File

@ -224,6 +224,19 @@ int UYVYToNV12(const uint8_t* src_uyvy,
int width,
int height);
// Convert NV21 to NV12.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_vu,
int src_stride_vu,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
LIBYUV_API
int YUY2ToY(const uint8_t* src_yuy2,
int src_stride_yuy2,

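A matching caller sketch for NV21ToNV12 (hypothetical names; NV21 and NV12 share the same Y plane layout, so the conversion only reorders the interleaved chroma bytes):

#include "libyuv/planar_functions.h"

void ReorderChroma(const uint8_t* src_y, const uint8_t* src_vu,
                   uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  int chroma_stride = ((width + 1) / 2) * 2;
  // Per the implementation below, passing NULL for dst_y skips the Y copy.
  libyuv::NV21ToNV12(src_y, width, src_vu, chroma_stride,
                     dst_y, width, dst_uv, chroma_stride, width, height);
}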
View File

@ -332,6 +332,7 @@ extern "C" {
#define HAS_ARGBTOUVROW_NEON
#define HAS_ARGBTOYJROW_NEON
#define HAS_ARGBTOYROW_NEON
#define HAS_AYUVTOUVROW_NEON
#define HAS_AYUVTOVUROW_NEON
#define HAS_AYUVTOYROW_NEON
#define HAS_BGRATOUVROW_NEON
@ -375,6 +376,7 @@ extern "C" {
#define HAS_SETROW_NEON
#define HAS_SPLITRGBROW_NEON
#define HAS_SPLITUVROW_NEON
#define HAS_UVToVUROW_NEON
#define HAS_UYVYTOARGBROW_NEON
#define HAS_UYVYTOUV422ROW_NEON
#define HAS_UYVYTOUVROW_NEON
@ -3370,17 +3372,34 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_C(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_vu,
int width);
void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
void AYUVToUVRow_Any_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_uv,
int width);
void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv,
int stride_ayuv,
uint8_t* dst_vu,
int width);
@ -4010,7 +4029,6 @@ void FloatDivToByteRow_NEON(const float* src_weights,
uint8_t* dst_mask,
int width);
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1727
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -880,6 +880,75 @@ int UYVYToI420(const uint8_t* src_uyvy,
return 0;
}
// Convert AYUV to NV12.
LIBYUV_API
int AYUVToNV12(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
void (*AYUVToUVRow)(const uint8_t* src_ayuv, int src_stride_ayuv,
uint8_t* dst_uv, int width) = AYUVToUVRow_C;
void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
AYUVToYRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
src_stride_ayuv = -src_stride_ayuv;
}
// Placeholders for future Intel code.
#if defined(HAS_AYUVTOYROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
AYUVToUVRow = AYUVToUVRow_Any_SSE2;
AYUVToYRow = AYUVToYRow_Any_SSE2;
if (IS_ALIGNED(width, 16)) {
AYUVToUVRow = AYUVToUVRow_SSE2;
AYUVToYRow = AYUVToYRow_SSE2;
}
}
#endif
#if defined(HAS_AYUVTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
AYUVToUVRow = AYUVToUVRow_Any_AVX2;
AYUVToYRow = AYUVToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
AYUVToUVRow = AYUVToUVRow_AVX2;
AYUVToYRow = AYUVToYRow_AVX2;
}
}
#endif
#if defined(HAS_AYUVTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
AYUVToYRow = AYUVToYRow_Any_NEON;
AYUVToUVRow = AYUVToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
AYUVToYRow = AYUVToYRow_NEON;
AYUVToUVRow = AYUVToUVRow_NEON;
}
}
#endif
for (y = 0; y < height - 1; y += 2) {
AYUVToUVRow(src_ayuv, src_stride_ayuv, dst_uv, width);
AYUVToYRow(src_ayuv, dst_y, width);
AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
src_ayuv += src_stride_ayuv * 2;
dst_y += dst_stride_y * 2;
dst_uv += dst_stride_uv;
}
if (height & 1) {
AYUVToUVRow(src_ayuv, 0, dst_uv, width);
AYUVToYRow(src_ayuv, dst_y, width);
}
return 0;
}
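Note the odd-height tail above: the UV kernel is called with a stride of 0, so the "row below" aliases the last row and the 2x2 filter degrades to a rounded horizontal average. A scalar model of that trick (illustrative, mirroring AYUVToUVRow_C further down):

// With stride 0 the same row is sampled twice, so
// (a + b + a + b + 2) >> 2 equals (a + b + 1) >> 1: a rounded 2x1 average.
static uint8_t TailAverage(uint8_t a, uint8_t b) {
  return (uint8_t)((a + b + a + b + 2) >> 2);
}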
// Convert AYUV to NV21.
LIBYUV_API
int AYUVToNV21(const uint8_t* src_ayuv,
@ -892,8 +961,7 @@ int AYUVToNV21(const uint8_t* src_ayuv,
int height) {
int y;
void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
uint8_t* dst_vu, int width) = AYUVToVURow_C;
void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
AYUVToYRow_C;
// Negative height means invert the image.
@ -2235,7 +2303,6 @@ int Android420ToI420(const uint8_t* src_y,
return 0;
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv

View File

@ -2008,10 +2008,8 @@ int NV21ToYUV24(const uint8_t* src_y,
int width,
int height) {
int y;
void (*NV21ToYUV24Row)(const uint8_t* src_y, const uint8_t* src_vu,
uint8_t* dst_yuv24, int width) = NV21ToYUV24Row_C;
if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
return -1;
}

View File

@ -440,7 +440,6 @@ void MergeUVPlane(const uint8_t* src_u,
int y;
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
uint8_t* dst_uv, int width) = MergeUVRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -504,6 +503,63 @@ void MergeUVPlane(const uint8_t* src_u,
}
}
// Convert NV21 to NV12.
LIBYUV_API
int NV21ToNV12(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_vu,
int src_stride_vu,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
UVToVURow_C;
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_vu || !dst_uv || width <= 0 || height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
src_stride_y = -src_stride_y;
src_stride_vu = -src_stride_vu;
}
// Coalesce rows.
if (src_stride_vu == halfwidth * 2 && dst_stride_uv == halfwidth * 2) {
halfwidth *= halfheight;
halfheight = 1;
src_stride_vu = dst_stride_uv = 0;
}
#if defined(HAS_UVToVUROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
UVToVURow = UVToVURow_Any_NEON;
if (IS_ALIGNED(halfwidth, 16)) {
UVToVURow = UVToVURow_NEON;
}
}
#endif
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
for (y = 0; y < halfheight; ++y) {
UVToVURow(src_vu, dst_uv, halfwidth);
src_vu += src_stride_vu;
dst_uv += dst_stride_uv;
}
return 0;
}
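The coalescing step above is easy to sanity-check with the 1280x720 geometry from the TESTED= line (values are illustrative, assuming tightly packed chroma):

int halfwidth = (1280 + 1) >> 1;   // 640 UV pairs per chroma row
int halfheight = (720 + 1) >> 1;   // 360 chroma rows
// Both strides equal 640 * 2 = 1280 bytes, so the plane is contiguous and
// the loop collapses to a single call covering 640 * 360 = 230400 pairs:
// UVToVURow(src_vu, dst_uv, halfwidth * halfheight);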
// Support function for NV12 etc RGB channels.
// Width and height are plane sizes (typically half pixel width).
LIBYUV_API

View File

@ -707,10 +707,12 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
#ifdef HAS_UYVYTOYROW_MMI
ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
#endif
#ifdef HAS_AYUVTOYROW_NEON
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
#endif
#ifdef HAS_UVToVUROW_NEON
ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
#endif
#ifdef HAS_RGB24TOARGBROW_NEON
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
#endif
@ -1416,6 +1418,7 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
}
#ifdef HAS_AYUVTOUVROW_NEON
ANY11S(AYUVToUVRow_Any_NEON, AYUVToUVRow_NEON, 0, 4, 15)
#endif
#ifdef HAS_AYUVTOVUROW_NEON
ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
#endif
#undef ANY11S

View File

@ -3236,14 +3236,13 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
int x;
for (x = 0; x < width - 1; x += 2) {
dst_yuv24[0] = src_vu[0]; // V
dst_yuv24[1] = src_vu[1]; // U
dst_yuv24[2] = src_y[0]; // Y0
dst_yuv24[3] = src_vu[0]; // V
dst_yuv24[4] = src_vu[1]; // U
dst_yuv24[5] = src_y[1]; // Y1
src_y += 2;
src_vu += 2;
@ -3256,6 +3255,33 @@ void NV21ToYUV24Row_C(const uint8_t* src_y,
}
}
// Filter 2 rows of AYUV UV's (444) into UV (420).
void AYUVToUVRow_C(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
// Output a row of UV values, filtering 2x2 rows of AYUV.
int x;
for (x = 0; x < width; x += 2) {
dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 5] + 2) >>
2;
dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 4] + 2) >>
2;
src_ayuv += 8;
dst_uv += 2;
}
if (width & 1) {
dst_uv[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 0] + 2) >>
2;
dst_uv[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 1] + 2) >>
2;
}
}
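Two details above are worth calling out: AYUV stores bytes in V,U,Y,A order, so dst_uv[0] reads offset 1 (U) to produce NV12's U-first layout, and the +2 bias makes the shift a round-to-nearest rather than a truncation. A small check (values are illustrative):

// Four chroma samples 1, 2, 3, 4: true mean is 2.5.
// (1 + 2 + 3 + 4 + 2) >> 2 == 3 (rounds up); without the bias, 10 >> 2 == 2.
static uint8_t RoundedAvg4(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
  return (uint8_t)((a + b + c + d + 2) >> 2);
}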
// Filter 2 rows of AYUV UV's (444) into VU (420).
void AYUVToVURow_C(const uint8_t* src_ayuv,
int src_stride_ayuv,
@ -3264,15 +3290,23 @@ void AYUVToVURow_C(const uint8_t* src_ayuv,
// Output a row of VU values, filtering 2x2 rows of AYUV.
int x;
for (x = 0; x < width; x += 2) {
dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 4] + 2) >>
2;
dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 5] + 2) >>
2;
src_ayuv += 8;
dst_vu += 2;
}
if (width & 1) {
dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] +
src_ayuv[src_stride_ayuv + 0] + 2) >>
2;
dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] +
src_ayuv[src_stride_ayuv + 1] + 2) >>
2;
}
}
// Copy row of AYUV Y's into Y
@ -3280,11 +3314,23 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// Output a row of Y values.
int x;
for (x = 0; x < width; ++x) {
dst_y[x] = src_ayuv[2]; // v,u,y,a
src_ayuv += 4;
}
}
void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
int x;
for (x = 0; x < width; ++x) {
uint8_t u = src_uv[0];
uint8_t v = src_uv[1];
dst_vu[0] = v;
dst_vu[1] = u;
src_uv += 2;
dst_vu += 2;
}
}
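A quick self-check for the scalar swap (hypothetical test values):

uint8_t uv[4] = {10, 20, 30, 40};  // U0,V0,U1,V1 as stored in NV12
uint8_t vu[4];
UVToVURow_C(uv, vu, 2);
// vu now holds {20, 10, 40, 30}, i.e. V0,U0,V1,U1 (NV21 order).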
// divide values by weights and provide mask to indicate weight of 0.
void FloatDivToByteRow_C(const float* src_weights,
const float* src_values,

View File

@ -5238,7 +5238,7 @@ void ARGBMultiplyRow_AVX2(const uint8_t* src_argb0,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif // HAS_ARGBMULTIPLYROW_AVX2
@ -6669,7 +6669,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
#ifdef HAS_NV21TOYUV24ROW_AVX2
// begin NV21ToYUV24Row_C avx2 constants
@ -6723,48 +6722,54 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
uint8_t* src_y_ptr;
uint64_t src_offset = 0;
uint64_t width64;
width64 = width;
src_y_ptr = (uint8_t*)src_y;
asm volatile(
"vmovdqu %5, %%ymm0 \n" // init blend value
"vmovdqu %6, %%ymm1 \n" // init blend value
"vmovdqu %7, %%ymm2 \n" // init blend value
// "sub $0x20, %3 \n" // sub 32 from width for final loop
LABELALIGN
"1: \n" // label 1
"vmovdqu (%0,%4), %%ymm3 \n" // src_y
"vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1
"vmovdqu (%1), %%ymm5 \n" // src_uv
"vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf
"vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for shuf
"vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for shuf
"vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf
"vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for shuf
"vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0
"vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0
"vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2
"vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1
"vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const
"vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results
"vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h
"vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results
"add $0x20, %4 \n" // add to src buffer ptr
"vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert
"vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert
"vmovdqu %%ymm4, (%2) \n" // store dst_yuv
"vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h
"add $0x60,%2 \n" // add to dst buffer ptr
// "cmp %3, %4 \n" // (width64 - 32 bytes) and src_offset
"sub $0x20,%3 \n" // 32 pixels per loop
"jg 1b \n"
"vzeroupper \n" // sse-avx2 transitions
: "+r"(src_y), //%0
"+r"(src_vu), //%1
@ -6780,7 +6785,8 @@ void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
"m"(kSHUF3), //%11
"m"(kSHUF4), //%12
"m"(kSHUF5) //%13
: "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12",
"xmm13", "xmm14", "xmm15");
}
#endif // HAS_NV21TOYUV24ROW_AVX2

View File

@ -561,7 +561,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@ -582,7 +582,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@ -607,7 +607,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "d0", "d1", "d2" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@ -632,7 +632,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
@ -648,7 +648,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
@ -761,7 +761,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
@ -778,7 +778,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
@ -795,7 +795,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3" // Clobber List
);
}
#define RGB565TOARGB \
@ -826,7 +826,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
#define ARGB1555TOARGB \
@ -872,7 +872,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
#define ARGB4444TOARGB \
@ -901,7 +901,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@ -919,7 +919,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
@ -935,7 +935,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
"+r"(width) // %2
:
: "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
);
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
@ -950,7 +950,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
@ -965,7 +965,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1" // Clobber List
);
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@ -985,7 +985,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@ -1005,7 +1005,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"+r"(width) // %3
:
: "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
);
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@ -1032,7 +1032,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
);
}
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@ -1059,7 +1059,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@ -1081,7 +1081,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "q0", "q1", "q2" // Clobber List
);
}
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@ -1241,7 +1241,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2564,7 +2564,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2), // %5
"r"(6) // %6
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// SobelY as a matrix is
@ -2601,7 +2601,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1), // %4
"r"(6) // %5
: "cc", "memory", "q0", "q1" // Clobber List
);
}
// %y passes a float as a scalar vector for vector * scalar multiply.
@ -2690,70 +2690,120 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
"vmov d1, d0 \n"
"vzip.u8 d0, d1 \n" // VV
"vmov d3, d2 \n"
"vzip.u8 d2, d3 \n" // UU
"subs %3, %3, #16 \n" // 16 pixels per loop
"vst3.8 {d0, d2, d4}, [%2]! \n" // store 16 YUV pixels
"vst3.8 {d1, d3, d5}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2");
}
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels.
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV pixels.
"vpadal.u8 q0, q4 \n" // V 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // U 16 bytes -> 8 shorts.
"vqrshrun.s16 d1, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d0, q1, #2 \n"
"subs %3, %3, #16 \n" // 16 processed per loop.
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels UV.
"bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width) {
asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels.
"vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV pixels.
"vpadal.u8 q0, q4 \n" // V 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // U 16 bytes -> 8 shorts.
"vqrshrun.s16 d0, q0, #2 \n" // 2x2 average
"vqrshrun.s16 d1, q1, #2 \n"
"subs %3, %3, #16 \n" // 16 processed per loop.
"vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU.
"bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_stride_ayuv), // %1
"+r"(dst_vu), // %2
"+r"(width) // %3
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
}
// Copy row of AYUV Y's into Y.
// Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
"subs %2, %2, #16 \n" // 16 processed per loop
"vst1.8 {q2}, [%1]! \n" // store 16 Y's.
"bgt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2", "q3");
}
// Convert biplanar UV channel of NV12 to NV21
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
"vld2.8 {d1, d3}, [%0]! \n"
"vorr.u8 q2, q0, q0 \n" // move U after V
"subs %2, %2, #16 \n" // 16 pixels per loop
"vst2.8 {q1, q2}, [%1]! \n" // store 16 VU pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
:
: "cc", "memory", "q0", "q1", "q2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
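The UVToVURow_NEON kernel above does the whole swap with deinterleaving vld2 loads and a reordered vst2 store. A rough NEON-intrinsics rendering of the same idea (an untested sketch for illustration, not part of the change; assumes width is a multiple of 16, which the Any wrapper guarantees for the SIMD path):

#include <arm_neon.h>

void UVToVURowIntrinsics(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int i = 0; i < width; i += 16) {
    uint8x16x2_t uv = vld2q_u8(src_uv + 2 * i);  // val[0] = U's, val[1] = V's
    uint8x16x2_t vu;
    vu.val[0] = uv.val[1];  // V first
    vu.val[1] = uv.val[0];  // then U
    vst2q_u8(dst_vu + 2 * i, vu);  // re-interleave as VU
  }
}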

View File

@ -608,7 +608,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Reads 16 U's and V's and writes out 16 pairs of UV.
@ -629,7 +629,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"+r"(width) // %3 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// Reads 16 packed RGB and write to planar dst_r, dst_g, dst_b.
@ -653,7 +653,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Reads 16 planar R's, G's and B's and writes out 16 packed RGB at a time
@ -677,7 +677,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"+r"(width) // %4
: // Input registers
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
// Copy multiple of 32.
@ -693,7 +693,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"+r"(width) // %2 // Output registers
: // Input registers
: "cc", "memory", "v0", "v1" // Clobber List
);
}
// SetRow writes 'width' bytes using an 8 bit value repeated.
@ -800,7 +800,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
@ -818,7 +818,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
@ -835,7 +835,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
#define RGB565TOARGB \
@ -867,7 +867,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
#define ARGB1555TOARGB \
@ -924,7 +924,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
#define ARGB4444TOARGB \
@ -955,7 +955,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
@ -973,7 +973,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
@ -990,7 +990,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
@ -1005,7 +1005,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
@ -1020,7 +1020,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
@ -1040,7 +1040,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
@ -1060,7 +1060,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
@ -1087,7 +1087,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
@ -1114,7 +1114,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v7" // Clobber List
);
}
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
@ -1135,7 +1135,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
void I422ToYUY2Row_NEON(const uint8_t* src_y,
@ -1298,7 +1298,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -1863,7 +1863,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27",
"v28"
);
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -2611,7 +2611,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
: "r"(2LL), // %5
"r"(6LL) // %6
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// SobelY as a matrix is
@ -2648,7 +2648,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
: "r"(1LL), // %4
"r"(6LL) // %5
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
// Caveat - rounds float to half float whereas scaling version truncates.
@ -2879,23 +2879,51 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
// Convert biplanar NV21 to packed YUV24
void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
asm volatile(
"1: \n"
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
"+r"(dst_yuv24), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2");
}
void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
"uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
void AYUVToVURow_NEON(const uint8_t* src_ayuv,
@ -2905,40 +2933,41 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts.
"uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts.
"uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
"+r"(dst_vu), // %2
"+r"(width) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
}
// Copy row of AYUV Y's into Y
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
asm volatile(
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3");
}
void FloatDivToByteRow_NEON(const float* src_weights,
@ -2962,7 +2991,7 @@ void FloatDivToByteRow_NEON(const float* src_weights,
"uqxtn v1.4h, v1.4s \n" // 8 shorts
"uqxtn2 v1.8h, v2.4s \n"
"uqxtn v1.8b, v1.8h \n" // 8 bytes
"st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
"fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero
@ -2974,15 +3003,31 @@ void FloatDivToByteRow_NEON(const float* src_weights,
"st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
"b.gt 1b \n"
: "+r"(src_weights), // %0
"+r"(src_values), // %1
"+r"(dst_out), // %2
"+r"(dst_mask), // %3
"+r"(width) // %4
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
}
// Convert biplanar UV channel of NV12 to NV21
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"1: \n"
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
:
: "cc", "memory", "v0", "v1", "v2");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus

View File

@ -4222,7 +4222,7 @@ __declspec(naked) void ARGBBlendRow_SSSE3(const uint8_t* src_argb0,
add ecx, 4 - 1
jl convertloop1b
// 1 pixel loop.
convertloop1:
movd xmm3, [eax] // src argb
lea eax, [eax + 4]
@ -5360,7 +5360,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@ -5448,7 +5448,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes.
lea eax, [eax + 4]
@ -5534,7 +5534,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
cvttps2dq xmm0, xmm2 // x, y float to int
packssdw xmm0, xmm0 // x, y as shorts

View File

@ -483,7 +483,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
: "m"(kShuf0), // %0
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
asm volatile(
LABELALIGN
@ -521,7 +521,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
@ -530,7 +530,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile(
LABELALIGN
@ -587,7 +587,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
: "m"(kShuf01), // %0
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
@ -596,7 +596,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
: "m"(kMadd01), // %0
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
asm volatile(
@ -690,7 +690,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb1), // %1
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
asm volatile(
LABELALIGN
@ -734,7 +734,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
: "m"(kShufAc), // %0
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
asm volatile(
LABELALIGN
@ -1272,7 +1272,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
:
: "m"(kShuffleColARGB), // %0
"m"(kShuffleFractions) // %1
);
asm volatile(
"movd %5,%%xmm2 \n"

View File

@ -40,7 +40,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x1 average down and write 16x1.
@ -61,7 +61,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "q0", "q1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
@ -92,7 +92,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %3
:
: "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@ -523,7 +523,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"+r"(src_width) // %2
:
: "memory", "cc", "q0", "q1", "q2" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
@ -705,7 +705,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
// 46: f964 018d vld4.32 {d16,d18,d20,d22}, [r4]!
@ -734,7 +734,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"+r"(dst_width) // %2
:
: "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,

View File

@ -38,7 +38,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
);
}
// Read 32x1 average down and write 16x1.
@ -60,7 +60,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "v0", "v1" // Clobber List
);
}
// Read 32x2 average down and write 16x1.
@ -89,7 +89,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleRowDown4_NEON(const uint8_t* src_ptr,
@ -534,7 +534,7 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"+r"(src_width) // %2
:
: "memory", "cc", "v0", "v1", "v2" // Clobber List
);
}
// TODO(Yang Zhang): Investigate less load instructions for
@ -719,7 +719,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
@ -742,7 +742,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"+r"(dst_width) // %2
:
: "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
);
}
void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
@ -991,7 +991,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"+r"(dst_width) // %3
:
: "v0", "v1", "v2", "v3" // Clobber List
);
}
// Read 8x2 upsample with filtering and write 16x1.
@ -1041,7 +1041,7 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"r"(14LL) // %5
: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18",
"v19" // Clobber List
);
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

View File

@@ -311,10 +311,10 @@ int I400ToNV21(const uint8_t* src_y,
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
-align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
-align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
@@ -329,21 +329,21 @@ int I400ToNV21(const uint8_t* src_y,
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
-SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
-SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
-dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
-dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
@@ -357,12 +357,12 @@ int I400ToNV21(const uint8_t* src_y,
} \
EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
-for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
int abs_diff = \
abs(static_cast<int>( \
-dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
static_cast<int>( \
-dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
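Aside: the SUBSAMPLE rewrites in the hunks above are not cosmetic. With a round-up subsample, an interleaved UV row must be computed as the subsampled width times two, not the doubled width subsampled; the two differ for odd widths. A minimal check, assuming the harness's usual round-up definition of SUBSAMPLE:

#include <cassert>

// Round-up subsample, as the convert tests define it (assumption).
constexpr int Subsample(int v, int a) {
  return (v + a - 1) / a;
}

int main() {
  // kWidth = 5, SUBSAMP_X = 2: the old expression loses one UV byte pair.
  assert(Subsample(5 * 2, 2) == 5);  // old: SUBSAMPLE(kWidth * 2, SUBSAMP_X)
  assert(Subsample(5, 2) * 2 == 6);  // new: SUBSAMPLE(kWidth, SUBSAMP_X) * 2
  // Even widths agree either way.
  assert(Subsample(1280 * 2, 2) == Subsample(1280, 2) * 2);
  return 0;
}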
@@ -395,6 +395,99 @@ TESTPLANARTOBP(I422, 2, 1, NV21, 2, 2)
TESTPLANARTOBP(I444, 1, 1, NV21, 2, 2)
TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
#define TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, \
OFF) \
TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
align_buffer_page_end(src_uv, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
OFF); \
align_buffer_page_end(dst_y_c, kWidth* kHeight); \
align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * \
SUBSAMPLE(kHeight, SUBSAMP_Y)); \
for (int i = 0; i < kHeight; ++i) \
for (int j = 0; j < kWidth; ++j) \
src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j * 2 + 0 + OFF] = \
(fastrand() & 0xff); \
src_uv[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) * 2 + j * 2 + 1 + OFF] = \
(fastrand() & 0xff); \
} \
} \
memset(dst_y_c, 1, kWidth* kHeight); \
memset(dst_uv_c, 2, \
SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
memset(dst_y_opt, 101, kWidth* kHeight); \
memset(dst_uv_opt, 102, \
SUBSAMPLE(kWidth, SUBSAMP_X) * 2 * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
MaskCpuFlags(disable_cpu_flags_); \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_c, kWidth, dst_uv_c, \
SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
SRC_FMT_PLANAR##To##FMT_PLANAR( \
src_y + OFF, kWidth, src_uv + OFF, \
SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * 2, dst_y_opt, kWidth, dst_uv_opt, \
SUBSAMPLE(kWidth, SUBSAMP_X) * 2, kWidth, NEG kHeight); \
} \
int max_diff = 0; \
for (int i = 0; i < kHeight; ++i) { \
for (int j = 0; j < kWidth; ++j) { \
int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
static_cast<int>(dst_y_opt[i * kWidth + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X) * 2; ++j) { \
int abs_diff = \
abs(static_cast<int>( \
dst_uv_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j]) - \
static_cast<int>( \
dst_uv_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) * 2 + j])); \
if (abs_diff > max_diff) { \
max_diff = abs_diff; \
} \
} \
} \
EXPECT_LE(max_diff, 1); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \
free_aligned_buffer_page_end(dst_uv_opt); \
free_aligned_buffer_page_end(src_y); \
free_aligned_buffer_page_end(src_uv); \
}
#define TESTBIPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_ - 4, _Any, +, 0) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Unaligned, +, 1) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Invert, -, 0) \
TESTBIPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, FMT_PLANAR, \
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _Opt, +, 0)
// TODO(fbarchard): Fix msan on this unittest
// TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
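The round-trip test stays disabled for now, but the new entry point can be exercised directly. A minimal usage sketch; the buffer names, the tightly packed strides, and the planar_functions.h home of the declaration are assumptions for illustration:

#include <stdint.h>

#include "libyuv/planar_functions.h"  // assumed home of NV21ToNV12 here

// Convert one NV21 frame to NV12, assuming tightly packed planes:
// Y stride == width, interleaved chroma stride == 2 * ((width + 1) / 2).
bool Nv21FrameToNv12(const uint8_t* src_y, const uint8_t* src_vu,
                     uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  const int chroma_stride = 2 * ((width + 1) / 2);
  return libyuv::NV21ToNV12(src_y, width, src_vu, chroma_stride,
                            dst_y, width, dst_uv, chroma_stride,
                            width, height) == 0;
}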
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \
DOY) \
@@ -680,8 +773,8 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
BPP_B, W1280, DIFF, N, NEG, OFF) \
TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \
const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
const int kHeight = benchmark_height_; \
@@ -740,15 +833,15 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
free_aligned_buffer_page_end(dst_argb32_opt); \
}
#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
DIFF) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
benchmark_width_ - 4, DIFF, _Any, +, 0) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
benchmark_width_, DIFF, _Unaligned, +, 1) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
benchmark_width_, DIFF, _Invert, -, 0) \
TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B, \
benchmark_width_, DIFF, _Opt, +, 0)
TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
@@ -980,6 +1073,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2)
TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
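The two AYUV entries above exercise the new conversions end to end. For orientation, here is a sketch of the chroma step they rely on: 2x2 box-averaging U and V straight out of packed AYUV. It assumes libyuv's AYUV byte order of V,U,Y,A and an even width; the shipped AYUVToUVRow_C/AYUVToVURow_C also handle the odd-width tail:

#include <stdint.h>

// Average a 2x2 block of U and of V from two AYUV rows into one interleaved
// UV pair (NV12 order). Byte order per pixel is V,U,Y,A; odd-width tail
// handling is omitted in this sketch.
static void AyuvToUvRowSketch(const uint8_t* src_ayuv, int src_stride_ayuv,
                              uint8_t* dst_uv, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_uv[0] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] +
                 src_ayuv[src_stride_ayuv + 5] + 2) >> 2;  // U
    dst_uv[1] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] +
                 src_ayuv[src_stride_ayuv + 4] + 2) >> 2;  // V
    src_ayuv += 8;  // advance two AYUV pixels
    dst_uv += 2;    // advance one UV pair
  }
}

For NV21 output, the same averaging applies with the U and V destination bytes swapped.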
#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \
@@ -1378,14 +1472,15 @@ TEST_F(LibYUVConvertTest, FuzzJpeg) {
orig_pixels[0] = 0xff;
orig_pixels[1] = 0xd8; // SOI.
orig_pixels[kSize - 1] = 0xff;
ValidateJpeg(orig_pixels, kSize); // Failure normally expected.
free_aligned_buffer_page_end(orig_pixels);
}
}
// Test data created in GIMP. In export jpeg, disable thumbnails etc,
// choose a subsampling, and use low quality (50) to keep size small.
// Generated with xxd -i test.jpg
// test 0 is J400
static const uint8_t kTest0Jpg[] = {
0xff, 0xd8, 0xff, 0xe0, 0x00, 0x10, 0x4a, 0x46, 0x49, 0x46, 0x00, 0x01,
@@ -1987,8 +2082,8 @@ TEST_F(LibYUVConvertTest, TestMJPGInfo) {
EXPECT_EQ(1, ShowJPegInfo(kTest1Jpg, kTest1JpgLen));
EXPECT_EQ(1, ShowJPegInfo(kTest2Jpg, kTest2JpgLen));
EXPECT_EQ(1, ShowJPegInfo(kTest3Jpg, kTest3JpgLen));
EXPECT_EQ(1, ShowJPegInfo(kTest4Jpg, kTest4JpgLen)); // Valid but unsupported.
}
#endif // HAVE_JPEG
@@ -2906,7 +3001,8 @@ TEST_F(LibYUVConvertTest, TestH010ToARGB) {
}
// Test 10 bit YUV to 10 bit RGB
// Caveat: Result is near due to float rounding in expected result.
TEST_F(LibYUVConvertTest, TestH010ToAR30) {
const int kSize = 1024;
int histogram_b[1024];
@@ -2969,7 +3065,8 @@ TEST_F(LibYUVConvertTest, TestH010ToAR30) {
}
// Test 10 bit YUV to 10 bit RGB
// Caveat: Result is near due to float rounding in expected result.
TEST_F(LibYUVConvertTest, TestH010ToAB30) {
const int kSize = 1024;
int histogram_b[1024];

View File

@@ -3268,10 +3268,10 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
}
float TestFloatDivToByte(int benchmark_width,
int benchmark_height,
int benchmark_iterations,
float scale,
bool opt) {
int i, j;
// NEON does multiple of 8, so round count up
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
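// (Aside on the mask arithmetic: (n + 7) & ~7 adds 7 then clears the low
// three bits, rounding n up to the next multiple of 8 — e.g. 13 -> 16,
// 16 -> 16 — so the NEON path always processes whole groups of 8 elements.)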
@@ -3287,7 +3287,8 @@ float TestFloatDivToByte(int benchmark_width,
// large values are problematic. audio is really -1 to 1.
for (i = 0; i < kPixels; ++i) {
(reinterpret_cast<float*>(src_weights))[i] = scale;
(reinterpret_cast<float*>(src_values))[i] =
sinf(static_cast<float>(i) * 0.1f);
}
memset(dst_out_c, 0, kPixels);
memset(dst_out_opt, 1, kPixels);
@@ -3295,24 +3296,24 @@ float TestFloatDivToByte(int benchmark_width,
memset(dst_mask_opt, 3, kPixels);
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
reinterpret_cast<float*>(src_values), dst_out_c,
dst_mask_c, kPixels);
for (j = 0; j < benchmark_iterations; j++) {
if (opt) {
#ifdef HAS_FLOATDIVTOBYTEROW_NEON
FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
reinterpret_cast<float*>(src_values), dst_out_opt,
dst_mask_opt, kPixels);
#else
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
reinterpret_cast<float*>(src_values), dst_out_opt,
dst_mask_opt, kPixels);
#endif
} else {
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
reinterpret_cast<float*>(src_values), dst_out_opt,
dst_mask_opt, kPixels);
}
}
@@ -3347,5 +3348,23 @@ TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
EXPECT_EQ(0, diff);
}
TEST_F(LibYUVPlanarTest, UVToVURow) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_vu, kPixels * 2);
align_buffer_page_end(dst_pixels_uv, kPixels * 2);
MemRandomize(src_pixels_vu, kPixels * 2);
memset(dst_pixels_uv, 1, kPixels * 2);
UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels);
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
EXPECT_EQ(dst_pixels_uv[i * 2 + 1], src_pixels_vu[i * 2 + 0]);
}
free_aligned_buffer_page_end(src_pixels_vu);
free_aligned_buffer_page_end(dst_pixels_uv);
}
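The test above pins down the contract: each interleaved chroma pair is byte-swapped. A sketch of what a UVToVURow_C implementation must therefore do (the shipped version and its NEON variant live in the row files):

#include <stdint.h>

// Swap each interleaved chroma pair. The transform is its own inverse, so the
// same row op covers both NV21->NV12 and NV12->NV21.
static void UvToVuRowSketch(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  for (int x = 0; x < width; ++x) {
    const uint8_t u = src_uv[0];
    const uint8_t v = src_uv[1];
    dst_vu[0] = v;
    dst_vu[1] = u;
    src_uv += 2;
    dst_vu += 2;
  }
}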
} // namespace libyuv

View File

@@ -189,7 +189,7 @@ static uint32_t SumSquareError_SSE2(const uint8_t* src_a,
,
"xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
); // NOLINT
return sse;
}
#endif // LIBYUV_DISABLE_X86 etc