Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-06 08:46:47 +08:00)
YUY2ToNV12 using YUY2ToY and YUY2ToNVUV
- Optimized YUY2ToNV12 that reduces it from 3 steps to 2 steps
  - Was SplitUV, memcpy Y, InterpolateUV
  - Now YUY2ToY, YUY2ToNVUV
- rollback LIBYUV_UNLIMITED_DATA

3840x2160 1000 iterations:

Pixel 2 Cortex A73
Was YUY2ToNV12_Opt (6515 ms)
Now YUY2ToNV12_Opt (3350 ms)

AB7 Mediatek P35 Cortex A53
Was YUY2ToNV12_Opt (6435 ms)
Now YUY2ToNV12_Opt (3301 ms)

Skylake AVX2 x64
Was YUY2ToNV12_Opt (1872 ms)
Now YUY2ToNV12_Opt (1657 ms)

SSE2 x64
Was YUY2ToNV12_Opt (2008 ms)
Now YUY2ToNV12_Opt (1691 ms)

Windows Skylake AVX2 32 bit x86
Was YUY2ToNV12_Opt (2161 ms)
Now YUY2ToNV12_Opt (1628 ms)

Bug: libyuv:943
Change-Id: I6c2ba2ae765413426baf770b837de114f808f6d0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3929843
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent 9ba40a8f03
commit 00950840d1
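For orientation, here is a minimal usage sketch of the public entry point this commit optimizes. It is not part of the commit; the umbrella header "libyuv.h", a plain C translation unit, and the wrapper name are assumptions, and buffers and strides are whatever the caller provides.

/* Minimal sketch (assumed usage, not from this commit): convert one YUY2
 * frame to NV12. Returns 0 on success, -1 on bad arguments. */
#include <stdint.h>
#include "libyuv.h"

int ConvertYuy2FrameToNv12(const uint8_t* src_yuy2, int src_stride_yuy2,
                           uint8_t* dst_y, int dst_stride_y,
                           uint8_t* dst_uv, int dst_stride_uv,
                           int width, int height) {
  /* After this commit the call below runs YUY2ToYRow per row and
   * YUY2ToNVUVRow per row pair instead of SplitUV + memcpy + InterpolateRow. */
  return YUY2ToNV12(src_yuy2, src_stride_yuy2, dst_y, dst_stride_y,
                    dst_uv, dst_stride_uv, width, height);
}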
@@ -236,8 +236,8 @@ extern "C" {
#define HAS_UYVYTOUVROW_AVX2
#define HAS_UYVYTOYROW_AVX2
#define HAS_YUY2TOARGBROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOUV422ROW_AVX2
#define HAS_YUY2TOYROW_AVX2

// Effects:

@@ -326,6 +326,7 @@ extern "C" {
#define HAS_SPLITXRGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#define HAS_YUY2TONVUVROW_SSE2

#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1

@@ -391,6 +392,7 @@ extern "C" {
#define HAS_SPLITXRGBROW_AVX2
#define HAS_SPLITUVROW_16_AVX2
#define HAS_SWAPUVROW_AVX2
#define HAS_YUY2TONVUVROW_AVX2

#if defined(__x86_64__) || !defined(__pic__)
// TODO(fbarchard): fix build error on android_full_debug=1

@@ -524,8 +526,9 @@ extern "C" {
#define HAS_UYVYTOUVROW_NEON
#define HAS_UYVYTOYROW_NEON
#define HAS_YUY2TOARGBROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TONVUVROW_NEON
#define HAS_YUY2TOUV422ROW_NEON
#define HAS_YUY2TOYROW_NEON

// Effects:
@@ -4724,6 +4727,10 @@ void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width);
void YUY2ToUV422Row_AVX2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,

@@ -4734,6 +4741,10 @@ void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width);
void YUY2ToUV422Row_SSE2(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,

@@ -4744,6 +4755,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width);
void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width);
void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_u,
                         uint8_t* dst_v,

@@ -4774,6 +4789,10 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
                   uint8_t* dst_u,
                   uint8_t* dst_v,
                   int width);
void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_uv,
                     int width);
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
                      uint8_t* dst_v,

@@ -4784,6 +4803,10 @@ void YUY2ToUVRow_Any_AVX2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
void YUY2ToNVUVRow_Any_AVX2(const uint8_t* src_yuy2,
                            int stride_yuy2,
                            uint8_t* dst_uv,
                            int width);
void YUY2ToUV422Row_Any_AVX2(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,

@@ -4794,6 +4817,10 @@ void YUY2ToUVRow_Any_SSE2(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2,
                            int stride_yuy2,
                            uint8_t* dst_uv,
                            int width);
void YUY2ToUV422Row_Any_SSE2(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,

@@ -4804,6 +4831,10 @@ void YUY2ToUVRow_Any_NEON(const uint8_t* src_ptr,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width);
void YUY2ToNVUVRow_Any_NEON(const uint8_t* src_yuy2,
                            int stride_yuy2,
                            uint8_t* dst_uv,
                            int width);
void YUY2ToUV422Row_Any_NEON(const uint8_t* src_ptr,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -5095,9 +5095,6 @@ int ARGBCopyYToAlpha(const uint8_t* src_y,
  return 0;
}

// TODO(fbarchard): Consider if width is even Y channel can be split
// directly. A SplitUVRow_Odd function could copy the remaining chroma.

LIBYUV_API
int YUY2ToNV12(const uint8_t* src_yuy2,
               int src_stride_yuy2,

@@ -5108,13 +5105,10 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
               int width,
               int height) {
  int y;
  int halfwidth = (width + 1) >> 1;
  void (*SplitUVRow)(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v,
                     int width) = SplitUVRow_C;
  void (*InterpolateRow)(uint8_t * dst_ptr, const uint8_t* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) = InterpolateRow_C;

  void (*YUY2ToYRow)(const uint8_t* src_yuy2, uint8_t* dst_y, int width) =
      YUY2ToYRow_C;
  void (*YUY2ToNVUVRow)(const uint8_t* src_yuy2, int stride_yuy2,
                        uint8_t* dst_uv, int width) = YUY2ToNVUVRow_C;
  if (!src_yuy2 || !dst_y || !dst_uv || width <= 0 || height == 0) {
    return -1;
  }

@@ -5125,109 +5119,83 @@ int YUY2ToNV12(const uint8_t* src_yuy2,
    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
    src_stride_yuy2 = -src_stride_yuy2;
  }
#if defined(HAS_SPLITUVROW_SSE2)
#if defined(HAS_YUY2TOYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    SplitUVRow = SplitUVRow_Any_SSE2;
    YUY2ToYRow = YUY2ToYRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      SplitUVRow = SplitUVRow_SSE2;
      YUY2ToYRow = YUY2ToYRow_SSE2;
    }
  }
#endif
#if defined(HAS_SPLITUVROW_AVX2)
#if defined(HAS_YUY2TOYROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    SplitUVRow = SplitUVRow_Any_AVX2;
    YUY2ToYRow = YUY2ToYRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      SplitUVRow = SplitUVRow_AVX2;
      YUY2ToYRow = YUY2ToYRow_AVX2;
    }
  }
#endif
#if defined(HAS_SPLITUVROW_NEON)
#if defined(HAS_YUY2TOYROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    SplitUVRow = SplitUVRow_Any_NEON;
    YUY2ToYRow = YUY2ToYRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      SplitUVRow = SplitUVRow_NEON;
      YUY2ToYRow = YUY2ToYRow_NEON;
    }
  }
#endif
#if defined(HAS_SPLITUVROW_MSA)
#if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUV422ROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    SplitUVRow = SplitUVRow_Any_MSA;
    YUY2ToYRow = YUY2ToYRow_Any_MSA;
    if (IS_ALIGNED(width, 32)) {
      SplitUVRow = SplitUVRow_MSA;
      YUY2ToYRow = YUY2ToYRow_MSA;
    }
  }
#endif
#if defined(HAS_SPLITUVROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    SplitUVRow = SplitUVRow_Any_LSX;
#if defined(HAS_YUY2TOYROW_LASX) && defined(HAS_YUY2TOUV422ROW_LASX)
  if (TestCpuFlag(kCpuHasLASX)) {
    YUY2ToYRow = YUY2ToYRow_Any_LASX;
    if (IS_ALIGNED(width, 32)) {
      SplitUVRow = SplitUVRow_LSX;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
    InterpolateRow = InterpolateRow_Any_SSSE3;
    if (IS_ALIGNED(width, 16)) {
      InterpolateRow = InterpolateRow_SSSE3;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    InterpolateRow = InterpolateRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      InterpolateRow = InterpolateRow_AVX2;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    InterpolateRow = InterpolateRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      InterpolateRow = InterpolateRow_NEON;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_MSA)
  if (TestCpuFlag(kCpuHasMSA)) {
    InterpolateRow = InterpolateRow_Any_MSA;
    if (IS_ALIGNED(width, 32)) {
      InterpolateRow = InterpolateRow_MSA;
    }
  }
#endif
#if defined(HAS_INTERPOLATEROW_LSX)
  if (TestCpuFlag(kCpuHasLSX)) {
    InterpolateRow = InterpolateRow_Any_LSX;
    if (IS_ALIGNED(width, 32)) {
      InterpolateRow = InterpolateRow_LSX;
      YUY2ToYRow = YUY2ToYRow_LASX;
    }
  }
#endif

  {
    int awidth = halfwidth * 2;
    // row of y and 2 rows of uv
    align_buffer_64(rows, awidth * 3);
#if defined(HAS_YUY2TONVUVROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    YUY2ToNVUVRow = YUY2ToNVUVRow_Any_SSE2;
    if (IS_ALIGNED(width, 16)) {
      YUY2ToNVUVRow = YUY2ToNVUVRow_SSE2;
    }
  }
#endif
#if defined(HAS_YUY2TONVUVROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    YUY2ToNVUVRow = YUY2ToNVUVRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      YUY2ToNVUVRow = YUY2ToNVUVRow_AVX2;
    }
  }
#endif
#if defined(HAS_YUY2TONVUVROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
    YUY2ToNVUVRow = YUY2ToNVUVRow_Any_NEON;
    if (IS_ALIGNED(width, 16)) {
      YUY2ToNVUVRow = YUY2ToNVUVRow_NEON;
    }
  }
#endif

    for (y = 0; y < height - 1; y += 2) {
      // Split Y from UV.
      SplitUVRow(src_yuy2, rows, rows + awidth, awidth);
      memcpy(dst_y, rows, width);
      SplitUVRow(src_yuy2 + src_stride_yuy2, rows, rows + awidth * 2, awidth);
      memcpy(dst_y + dst_stride_y, rows, width);
      InterpolateRow(dst_uv, rows + awidth, awidth, awidth, 128);
      src_yuy2 += src_stride_yuy2 * 2;
      dst_y += dst_stride_y * 2;
      dst_uv += dst_stride_uv;
    }
    if (height & 1) {
      // Split Y from UV.
      SplitUVRow(src_yuy2, rows, dst_uv, awidth);
      memcpy(dst_y, rows, width);
    }
    free_aligned_buffer_64(rows);
  for (y = 0; y < height - 1; y += 2) {
    YUY2ToYRow(src_yuy2, dst_y, width);
    YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
    YUY2ToNVUVRow(src_yuy2, src_stride_yuy2, dst_uv, width);
    src_yuy2 += src_stride_yuy2 * 2;
    dst_y += dst_stride_y * 2;
    dst_uv += dst_stride_uv;
  }
  if (height & 1) {
    YUY2ToYRow(src_yuy2, dst_y, width);
    YUY2ToNVUVRow(src_yuy2, 0, dst_uv, width);
  }
  return 0;
}
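The hunk above interleaves the removed SplitUV/InterpolateRow path with its replacement, so as a reading aid here is a condensed sketch of the new 2-step inner loop using only the scalar C row functions from this diff. It is not part of the commit: SIMD dispatch and negative-height handling are omitted, and the helper name is mine.

/* Condensed sketch (assumption: C row paths only) of the new structure. */
#include <stdint.h>
#include "libyuv/row.h" /* declares YUY2ToYRow_C and YUY2ToNVUVRow_C */

static void Yuy2ToNv12TwoStepSketch(const uint8_t* src_yuy2,
                                    int src_stride_yuy2,
                                    uint8_t* dst_y, int dst_stride_y,
                                    uint8_t* dst_uv, int dst_stride_uv,
                                    int width, int height) {
  int y;
  for (y = 0; y < height - 1; y += 2) {
    YUY2ToYRow_C(src_yuy2, dst_y, width);                      /* Y of row 0 */
    YUY2ToYRow_C(src_yuy2 + src_stride_yuy2,
                 dst_y + dst_stride_y, width);                 /* Y of row 1 */
    YUY2ToNVUVRow_C(src_yuy2, src_stride_yuy2, dst_uv, width); /* averaged UV */
    src_yuy2 += src_stride_yuy2 * 2;
    dst_y += dst_stride_y * 2;
    dst_uv += dst_stride_uv;
  }
  if (height & 1) {
    YUY2ToYRow_C(src_yuy2, dst_y, width);
    YUY2ToNVUVRow_C(src_yuy2, 0, dst_uv, width); /* stride 0: row averaged with itself */
  }
}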
@@ -673,6 +673,35 @@ ANY21(SobelXYRow_Any_LSX, SobelXYRow_LSX, 0, 1, 1, 4, 15)
#endif
#undef ANY21

// Any 2 planes to 1 with stride
// width is measured in source pixels. 4 bytes contains 2 pixels
#define ANY21S(NAMEANY, ANY_SIMD, SBPP, BPP, MASK)                         \
  void NAMEANY(const uint8_t* src_yuy2, int stride_yuy2, uint8_t* dst_uv,  \
               int width) {                                                \
    SIMD_ALIGNED(uint8_t temp[32 * 3]);                                    \
    memset(temp, 0, 32 * 2); /* for msan */                                \
    int awidth = (width + 1) / 2;                                          \
    int r = awidth & MASK;                                                 \
    int n = awidth & ~MASK;                                                \
    if (n > 0) {                                                           \
      ANY_SIMD(src_yuy2, stride_yuy2, dst_uv, n * 2);                      \
    }                                                                      \
    memcpy(temp, src_yuy2 + n * SBPP, r * SBPP);                           \
    memcpy(temp + 32, src_yuy2 + stride_yuy2 + n * SBPP, r * SBPP);        \
    ANY_SIMD(temp, 32, temp + 64, MASK + 1);                               \
    memcpy(dst_uv + n * BPP, temp + 64, r * BPP);                          \
  }

#ifdef HAS_YUY2TONVUVROW_NEON
ANY21S(YUY2ToNVUVRow_Any_NEON, YUY2ToNVUVRow_NEON, 4, 2, 7)
#endif
#ifdef HAS_YUY2TONVUVROW_SSE2
ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7)
#endif
#ifdef HAS_YUY2TONVUVROW_AVX2
ANY21S(YUY2ToNVUVRow_Any_AVX2, YUY2ToNVUVRow_AVX2, 4, 2, 15)
#endif

// Any 2 planes to 1 with yuvconstants
#define ANY21C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK)            \
  void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \
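For reference, expanding the ANY21S wrapper above by hand for the SSE2 instantiation (SBPP = 4, BPP = 2, MASK = 7) yields roughly the code below. The expansion and its comments are mine, not part of the diff.

/* Hand expansion of ANY21S(YUY2ToNVUVRow_Any_SSE2, YUY2ToNVUVRow_SSE2, 4, 2, 7). */
void YUY2ToNVUVRow_Any_SSE2(const uint8_t* src_yuy2, int stride_yuy2,
                            uint8_t* dst_uv, int width) {
  SIMD_ALIGNED(uint8_t temp[32 * 3]);
  memset(temp, 0, 32 * 2); /* for msan */
  int awidth = (width + 1) / 2; /* width in UV pairs; 4 source bytes each */
  int r = awidth & 7;           /* leftover UV pairs */
  int n = awidth & ~7;          /* UV pairs the SIMD row function handles */
  if (n > 0) {
    YUY2ToNVUVRow_SSE2(src_yuy2, stride_yuy2, dst_uv, n * 2);
  }
  /* Copy the ragged tails of both rows into a zeroed, padded buffer, run one
   * full 8-pair iteration there, then copy back only the valid UV bytes. */
  memcpy(temp, src_yuy2 + n * 4, r * 4);
  memcpy(temp + 32, src_yuy2 + stride_yuy2 + n * 4, r * 4);
  YUY2ToNVUVRow_SSE2(temp, 32, temp + 64, 8);
  memcpy(dst_uv + n * 2, temp + 64, r * 2);
}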
@@ -22,9 +22,13 @@ extern "C" {
#endif

// This macro controls YUV to RGB using unsigned math to extend range of
// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B.
// Enable this macro for backwards compatiability with limited range 0 to 2.
// LIBYUV_LIMITED_DATA
// YUV to RGB coefficients to 0 to 4 instead of 0 to 2 for more accuracy on B:
// LIBYUV_UNLIMITED_DATA

// Macros to enable unlimited data for each colorspace
// LIBYUV_UNLIMITED_BT601
// LIBYUV_UNLIMITED_BT709
// LIBYUV_UNLIMITED_BT2020

// The following macro from row_win makes the C code match the row_win code,
// which is 7 bit fixed point for ARGBToI420:
@@ -1480,7 +1484,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// KR = 0.299; KB = 0.114

// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT601)
#define UB 129 /* round(2.018 * 64) */
#else
#define UB 128 /* max(128, round(2.018 * 64)) */

@@ -1534,7 +1538,7 @@ MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR)
// KR = 0.2126, KB = 0.0722

// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT709)
#define UB 135 /* round(2.112 * 64) */
#else
#define UB 128 /* max(128, round(2.112 * 64)) */

@@ -1588,7 +1592,7 @@ MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR)
// KR = 0.2627; KB = 0.0593

// U and V contributions to R,G,B.
#if !defined(LIBYUV_LIMITED_DATA)
#if defined(LIBYUV_UNLIMITED_DATA) || defined(LIBYUV_UNLIMITED_BT2020)
#define UB 137 /* round(2.142 * 64) */
#else
#define UB 128 /* max(128, round(2.142 * 64)) */
@@ -3094,6 +3098,21 @@ void YUY2ToUVRow_C(const uint8_t* src_yuy2,
  }
}

// Filter 2 rows of YUY2 UV's (422) into UV (NV12).
void YUY2ToNVUVRow_C(const uint8_t* src_yuy2,
                     int src_stride_yuy2,
                     uint8_t* dst_uv,
                     int width) {
  // Output a row of UV values, filtering 2 rows of YUY2.
  int x;
  for (x = 0; x < width; x += 2) {
    dst_uv[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
    dst_uv[1] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
    src_yuy2 += 4;
    dst_uv += 2;
  }
}

// Copy row of YUY2 UV's (422) into U and V (422).
void YUY2ToUV422Row_C(const uint8_t* src_yuy2,
                      uint8_t* dst_u,
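A tiny worked example of the YUY2ToNVUVRow_C reference above: the input values are made up for illustration, and the program assumes the libyuv include directory is on the include path.

#include <stdint.h>
#include <stdio.h>
#include "libyuv/row.h" /* declares YUY2ToNVUVRow_C */

int main(void) {
  /* Two rows of one YUY2 macro-pixel each (Y0 U Y1 V), so the second row
   * starts at a stride of 4 bytes. */
  const uint8_t yuy2[8] = {10, 100, 20, 200,   /* row 0: U=100, V=200 */
                           30, 120, 40, 220};  /* row 1: U=120, V=220 */
  uint8_t uv[2];
  YUY2ToNVUVRow_C(yuy2, /*src_stride_yuy2=*/4, uv, /*width=*/2);
  /* Rounded averages of the two rows' chroma. */
  printf("U=%d V=%d\n", uv[0], uv[1]); /* prints U=110 V=210 */
  return 0;
}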
@@ -6739,6 +6739,33 @@ void YUY2ToYRow_SSE2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToNVUVRow_SSE2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(LABELALIGN
      "1: \n"
      "movdqu (%0),%%xmm0 \n"           // 16 bytes of row 0 (8 YUY2 pixels)
      "movdqu 0x10(%0),%%xmm1 \n"       // next 16 bytes of row 0
      "movdqu 0x00(%0,%3,1),%%xmm2 \n"  // 16 bytes of row 1 (src + stride)
      "movdqu 0x10(%0,%3,1),%%xmm3 \n"  // next 16 bytes of row 1
      "lea 0x20(%0),%0 \n"              // advance src by 16 pixels
      "pavgb %%xmm2,%%xmm0 \n"          // average the two rows
      "pavgb %%xmm3,%%xmm1 \n"
      "psrlw $0x8,%%xmm0 \n"            // keep U/V bytes, drop Y
      "psrlw $0x8,%%xmm1 \n"
      "packuswb %%xmm1,%%xmm0 \n"       // pack 16 interleaved UV bytes
      "movdqu %%xmm0,(%1) \n"           // store UV
      "lea 0x10(%1),%1 \n"
      "sub $0x10,%2 \n"                 // 16 pixels per iteration
      "jg 1b \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_uv),    // %1
        "+r"(width)      // %2
      : "r"((intptr_t)(stride_yuy2))  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
}

void YUY2ToUVRow_SSE2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
@@ -6939,6 +6966,35 @@ void YUY2ToYRow_AVX2(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
               : "memory", "cc", "xmm0", "xmm1", "xmm5");
}

void YUY2ToNVUVRow_AVX2(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(

      LABELALIGN
      "1: \n"
      "vmovdqu (%0),%%ymm0 \n"                  // 32 bytes of row 0 (16 pixels)
      "vmovdqu 0x20(%0),%%ymm1 \n"              // next 32 bytes of row 0
      "vpavgb 0x00(%0,%3,1),%%ymm0,%%ymm0 \n"   // average with row 1
      "vpavgb 0x20(%0,%3,1),%%ymm1,%%ymm1 \n"
      "lea 0x40(%0),%0 \n"                      // advance src by 32 pixels
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"            // keep U/V bytes, drop Y
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"       // pack 32 interleaved UV bytes
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"           // fix 128-bit lane order after pack
      "vmovdqu %%ymm0,(%1) \n"                  // store UV
      "lea 0x20(%1),%1 \n"
      "sub $0x20,%2 \n"                         // 32 pixels per iteration
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_yuy2),  // %0
        "+r"(dst_uv),    // %1
        "+r"(width)      // %2
      : "r"((intptr_t)(stride_yuy2))  // %3
      : "memory", "cc", "xmm0", "xmm1");
}

void YUY2ToUVRow_AVX2(const uint8_t* src_yuy2,
                      int stride_yuy2,
                      uint8_t* dst_u,
@@ -1583,6 +1583,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
  );
}

void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  asm volatile(
      "add %1, %0, %1 \n"  // stride + src_yuy2
      "1: \n"
      "vld2.8 {q0, q1}, [%0]! \n"  // load 16 pixels of YUY2.
      "subs %3, %3, #16 \n"  // 16 pixels = 8 UVs.
      "vld2.8 {q2, q3}, [%1]! \n"  // load next row YUY2.
      "vrhadd.u8 q4, q1, q3 \n"  // average rows of UV
      "vst1.8 {q4}, [%2]! \n"  // store 8 UV.
      "bgt 1b \n"
      : "+r"(src_yuy2),     // %0
        "+r"(stride_yuy2),  // %1
        "+r"(dst_uv),       // %2
        "+r"(width)         // %3
      :
      : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
        "d7"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
@@ -1808,6 +1808,29 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
  );
}

void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
                        int stride_yuy2,
                        uint8_t* dst_uv,
                        int width) {
  const uint8_t* src_yuy2b = src_yuy2 + stride_yuy2;
  asm volatile(
      "1: \n"
      "ld2 {v0.16b,v1.16b}, [%0], #32 \n"  // load 16 pixels
      "subs %w3, %w3, #16 \n"  // 16 pixels = 8 UVs.
      "ld2 {v2.16b,v3.16b}, [%1], #32 \n"  // load next row
      "urhadd v4.16b, v1.16b, v3.16b \n"  // average rows of UV
      "prfm pldl1keep, [%0, 448] \n"
      "st1 {v4.16b}, [%2], #16 \n"  // store 8 UV.
      "b.gt 1b \n"
      : "+r"(src_yuy2),   // %0
        "+r"(src_yuy2b),  // %1
        "+r"(dst_uv),     // %2
        "+r"(width)       // %3
      :
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4"  // Clobber List
  );
}

// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_NEON(const uint8_t* src_argb,
                         uint8_t* dst_argb,
@@ -32,10 +32,10 @@ namespace libyuv {
#endif
#define ERROR_R 1
#define ERROR_G 1
#if defined(LIBYUV_LIMITED_DATA)
#define ERROR_B 18
#else
#ifdef LIBYUV_UNLIMITED_DATA
#define ERROR_B 1
#else
#define ERROR_B 18
#endif

#define TESTCS(TESTNAME, YUVTOARGB, ARGBTOYUV, HS1, HS, HN, DIFF) \
@@ -502,10 +502,10 @@ TEST_F(LibYUVColorTest, TestYUV) {
  YUVToRGB(240, 0, 0, &r1, &g1, &b1);
  EXPECT_EQ(57, r1);
  EXPECT_EQ(255, g1);
#if defined(LIBYUV_LIMITED_DATA)
  EXPECT_EQ(5, b1);
#else
#ifdef LIBYUV_UNLIMITED_DATA
  EXPECT_EQ(3, b1);
#else
  EXPECT_EQ(5, b1);
#endif

  for (int i = 0; i < 256; ++i) {
@@ -2620,10 +2620,10 @@ TEST_F(LibYUVConvertTest, TestMJPGToARGB) {

  // Test result matches known hash value.
  uint32_t dst_argb_hash = HashDjb2(dst_argb, width * height, 5381);
#if defined(LIBYUV_LIMITED_DATA)
  EXPECT_EQ(dst_argb_hash, 2355976473u);
#else
#ifdef LIBYUV_UNLIMITED_DATA
  EXPECT_EQ(dst_argb_hash, 3900633302u);
#else
  EXPECT_EQ(dst_argb_hash, 2355976473u);
#endif

  free_aligned_buffer_page_end(dst_argb);