diff --git a/README.chromium b/README.chromium index 4b758aa51..d42db11c1 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 410 +Version: 413 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index bead6eec3..b25853c48 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -84,6 +84,8 @@ extern "C" { #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 +#define HAS_I422TOUYVYROW_SSE2 // Effects #define HAS_ARGBAFFINEROW_SSE2 @@ -119,6 +121,9 @@ extern "C" { #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOARGBROW_NEON #define HAS_ARGBTOBAYERROW_NEON +#define HAS_ARGBTORAWROW_NEON +#define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGBAROW_NEON #define HAS_BGRATOARGBROW_NEON #define HAS_COPYROW_NEON #define HAS_HALFROW_NEON @@ -130,6 +135,8 @@ extern "C" { #define HAS_I422TORGBAROW_NEON #define HAS_MIRRORROW_NEON #define HAS_MIRRORROWUV_NEON +#define HAS_NV12TOARGBROW_NEON +#define HAS_NV21TOARGBROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGBATOARGBROW_NEON @@ -141,13 +148,9 @@ extern "C" { #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I422TOUYVYROW_NEON -// TODO(fbarchard): Hook these up to calling functions. -#define HAS_ARGBTORAWROW_NEON -#define HAS_ARGBTORGB24ROW_NEON -#define HAS_ARGBTORGBAROW_NEON -#define HAS_NV12TOARGBROW_NEON -#define HAS_NV21TOARGBROW_NEON #endif #if defined(_MSC_VER) && !defined(__CLR_VER) @@ -768,6 +771,31 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width); + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5acb9e225..eef9ce8ef 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 410 +#define LIBYUV_VERSION 413 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from.cc b/source/convert_from.cc index b9c205518..443c140b7 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -222,208 +222,6 @@ int I400Copy(const uint8* src_y, int src_stride_y, return 0; } -// YUY2 - Macro-pixel = 2 image pixels -// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... - -// UYVY - Macro-pixel = 2 image pixels -// U0Y0V0Y1 - -#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) -#define HAS_I422TOYUY2ROW_SSE2 -__declspec(naked) __declspec(align(16)) -static void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - align 16 - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqa xmm0, [eax] // Y - lea eax, [eax + 16] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm2 // YUYV - punpckhbw xmm1, xmm2 - movdqa [edi], xmm0 - movdqa [edi + 16], xmm1 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} - -#define HAS_I422TOUYVYROW_SSE2 -__declspec(naked) __declspec(align(16)) -static void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u - mov edx, [esp + 8 + 12] // src_v - mov edi, [esp + 8 + 16] // dst_frame - mov ecx, [esp + 8 + 20] // width - sub edx, esi - - align 16 - convertloop: - movq xmm2, qword ptr [esi] // U - movq xmm3, qword ptr [esi + edx] // V - lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV - movdqa xmm0, [eax] // Y - movdqa xmm1, xmm2 - lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY - punpckhbw xmm2, xmm0 - movdqa [edi], xmm1 - movdqa [edi + 16], xmm2 - lea edi, [edi + 32] - sub ecx, 16 - jg convertloop - - pop edi - pop esi - ret - } -} -#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__)) -#define HAS_I422TOYUY2ROW_SSE2 -static void I422ToYUY2Row_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - ".p2align 4 \n" - "1: \n" - "movq (%1),%%xmm2 \n" - "movq (%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm2,%%xmm0 \n" - "punpckhbw %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} - -#define HAS_I422TOUYVYROW_SSE2 -static void I422ToUYVYRow_SSE2(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - asm volatile ( - "sub %1,%2 \n" - ".p2align 4 \n" - "1: \n" - "movq (%1),%%xmm2 \n" - "movq (%1,%2,1),%%xmm3 \n" - "lea 0x8(%1),%1 \n" - "punpcklbw %%xmm3,%%xmm2 \n" - "movdqa (%0),%%xmm0 \n" - "movdqa %%xmm2,%%xmm1 \n" - "lea 0x10(%0),%0 \n" - "punpcklbw %%xmm0,%%xmm1 \n" - "punpckhbw %%xmm0,%%xmm2 \n" - "movdqa %%xmm1,(%3) \n" - "movdqa %%xmm2,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x10,%4 \n" - "jg 1b \n" - : "+r"(src_y), // %0 - "+r"(src_u), // %1 - "+r"(src_v), // %2 - "+r"(dst_frame), // %3 - "+rm"(width) // %4 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3" -#endif - ); -} -#endif - -static void I422ToYUY2Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - for (int x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = src_y[1]; - dst_frame[3] = src_v[0]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_y[0]; - dst_frame[1] = src_u[0]; - dst_frame[2] = src_y[0]; // duplicate last y - dst_frame[3] = src_v[0]; - } -} - -static void I422ToUYVYRow_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_frame, int width) { - for (int x = 0; x < width - 1; x += 2) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = src_y[1]; - dst_frame += 4; - src_y += 2; - src_u += 1; - src_v += 1; - } - if (width & 1) { - dst_frame[0] = src_u[0]; - dst_frame[1] = src_y[0]; - dst_frame[2] = src_v[0]; - dst_frame[3] = src_y[0]; // duplicate last y - } -} - // Visual C x86 or GCC little endian. #if defined(__x86_64__) || defined(_M_X64) || \ defined(__i386__) || defined(_M_IX86) || \ @@ -463,7 +261,6 @@ static void UYVYToV210Row_C(const uint8* src_uyvy, uint8* dst_v210, int width) { } } -// TODO(fbarchard): Deprecate, move or expand 422 support? LIBYUV_API int I422ToYUY2(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -490,6 +287,10 @@ int I422ToYUY2(const uint8* src_y, int src_stride_y, IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } #endif for (int y = 0; y < height; ++y) { @@ -528,6 +329,10 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y, IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { I422ToYUY2Row = I422ToYUY2Row_SSE2; } +#elif defined(HAS_I422TOYUY2ROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + I422ToYUY2Row = I422ToYUY2Row_NEON; + } #endif for (int y = 0; y < height - 1; y += 2) { @@ -572,6 +377,10 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y, IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } #endif for (int y = 0; y < height; ++y) { @@ -610,6 +419,10 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y, IS_ALIGNED(dst_frame, 16) && IS_ALIGNED(dst_stride_frame, 16)) { I422ToUYVYRow = I422ToUYVYRow_SSE2; } +#elif defined(HAS_I422TOUYVYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { + I422ToUYVYRow = I422ToUYVYRow_NEON; + } #endif for (int y = 0; y < height - 1; y += 2) { diff --git a/source/row_common.cc b/source/row_common.cc index eb8b56178..2e7b2e4cb 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1279,6 +1279,50 @@ void ARGBToBayerRow_C(const uint8* src_argb, } } +void I422ToYUY2Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[1]; + dst_frame[3] = src_v[0]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_y[0]; + dst_frame[1] = src_u[0]; + dst_frame[2] = src_y[0]; // duplicate last y + dst_frame[3] = src_v[0]; + } +} + +void I422ToUYVYRow_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + for (int x = 0; x < width - 1; x += 2) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[1]; + dst_frame += 4; + src_y += 2; + src_u += 1; + src_v += 1; + } + if (width & 1) { + dst_frame[0] = src_u[0]; + dst_frame[1] = src_y[0]; + dst_frame[2] = src_v[0]; + dst_frame[3] = src_y[0]; // duplicate last y + } +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_neon.cc b/source/row_neon.cc index 0d5db423c..5c5b8150c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -75,10 +75,10 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, #endif #ifdef HAS_I422TOARGBROW_NEON -void I422ToARGBRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -94,13 +94,13 @@ void I422ToARGBRow_NEON(const uint8* y_buf, "vmov.u8 d23, #255 \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -108,10 +108,10 @@ void I422ToARGBRow_NEON(const uint8* y_buf, #endif // HAS_I422TOARGBROW_NEON #ifdef HAS_I422TOBGRAROW_NEON -void I422ToBGRARow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -128,13 +128,13 @@ void I422ToBGRARow_NEON(const uint8* y_buf, "vmov.u8 d19, #255 \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -142,10 +142,10 @@ void I422ToBGRARow_NEON(const uint8* y_buf, #endif // HAS_I422TOBGRAROW_NEON #ifdef HAS_I422TOABGRROW_NEON -void I422ToABGRRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToABGRRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_abgr, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -162,13 +162,13 @@ void I422ToABGRRow_NEON(const uint8* y_buf, "vmov.u8 d23, #255 \n" "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_abgr), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -176,10 +176,10 @@ void I422ToABGRRow_NEON(const uint8* y_buf, #endif // HAS_I422TOABGRROW_NEON #ifdef HAS_I422TORGBAROW_NEON -void I422ToRGBARow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToRGBARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgba, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -195,13 +195,13 @@ void I422ToRGBARow_NEON(const uint8* y_buf, "vmov.u8 d19, #255 \n" "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgba), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -209,10 +209,10 @@ void I422ToRGBARow_NEON(const uint8* y_buf, #endif // HAS_I422TORGBAROW_NEON #ifdef HAS_I422TORGB24ROW_NEON -void I422ToRGB24Row_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToRGB24Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb24, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -227,13 +227,13 @@ void I422ToRGB24Row_NEON(const uint8* y_buf, "subs %4, %4, #8 \n" "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 - "+r"(width) // %4 - : "r"(&kUVToRB), // %5 - "r"(&kUVToG) // %6 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_rgb24), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -241,10 +241,10 @@ void I422ToRGB24Row_NEON(const uint8* y_buf, #endif // HAS_I422TORGB24ROW_NEON #ifdef HAS_I422TORAWROW_NEON -void I422ToRAWRow_NEON(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, int width) { asm volatile ( "vld1.u8 {d24}, [%5] \n" @@ -260,10 +260,10 @@ void I422ToRAWRow_NEON(const uint8* y_buf, "vswp.u8 d20, d22 \n" "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(rgb_buf), // %3 + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 "+r"(width) // %4 : "r"(&kUVToRB), // %5 "r"(&kUVToG) // %6 @@ -274,9 +274,9 @@ void I422ToRAWRow_NEON(const uint8* y_buf, #endif // HAS_I422TORAWROW_NEON #ifdef HAS_NV12TOARGBROW_NEON -void NV12ToARGBRow_NEON(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, +void NV12ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, int width) { asm volatile ( "vld1.u8 {d24}, [%4] \n" @@ -292,12 +292,12 @@ void NV12ToARGBRow_NEON(const uint8* y_buf, "vmov.u8 d23, #255 \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(uv_buf), // %1 - "+r"(rgb_buf), // %2 - "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -305,9 +305,9 @@ void NV12ToARGBRow_NEON(const uint8* y_buf, #endif // HAS_NV12TOARGBROW_NEON #ifdef HAS_NV21TOARGBROW_NEON -void NV21ToARGBRow_NEON(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, +void NV21ToARGBRow_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_argb, int width) { asm volatile ( "vld1.u8 {d24}, [%4] \n" @@ -323,12 +323,12 @@ void NV21ToARGBRow_NEON(const uint8* y_buf, "vmov.u8 d23, #255 \n" "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" - : "+r"(y_buf), // %0 - "+r"(uv_buf), // %1 - "+r"(rgb_buf), // %2 - "+r"(width) // %3 - : "r"(&kUVToRB), // %4 - "r"(&kUVToG) // %5 + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_argb), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); @@ -862,6 +862,52 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, ); } +void I422ToYUY2Row_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_yuy2, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + "vld1.8 {d1}, [%1]! \n" // load 8 Us + "vld1.8 {d3}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_yuy2), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + +void I422ToUYVYRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_uyvy, int width) { + asm volatile ( + ".p2align 2 \n" + "1: \n" + "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + "vld1.8 {d0}, [%1]! \n" // load 8 Us + "vld1.8 {d2}, [%2]! \n" // load 8 Vs + "subs %4, %4, #16 \n" // 16 pixels + "vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_uyvy), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "d0", "d1", "d2", "d3" + ); +} + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index e7cdd752e..159a790bc 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4109,6 +4109,77 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, #endif ); } + +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq (%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa (%0),%%xmm0 \n" + "lea 0x10(%0),%0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm0 \n" + "punpckhbw %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%3) \n" + "movdqa %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + asm volatile ( + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movq (%1),%%xmm2 \n" + "movq (%1,%2,1),%%xmm3 \n" + "lea 0x8(%1),%1 \n" + "punpcklbw %%xmm3,%%xmm2 \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "lea 0x10(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpckhbw %%xmm0,%%xmm2 \n" + "movdqa %%xmm1,(%3) \n" + "movdqa %%xmm2,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x10,%4 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_frame), // %3 + "+rm"(width) // %4 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3" +#endif + ); +} + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index cf29bb632..2bb9ef958 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4239,6 +4239,87 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, } } +// YUY2 - Macro-pixel = 2 image pixels +// Y0U0Y1V0....Y2U2Y3V2...Y4U4Y5V4.... + +// UYVY - Macro-pixel = 2 image pixels +// U0Y0V0Y1 + +__declspec(naked) __declspec(align(16)) +void I422ToYUY2Row_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 16 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqa xmm0, [eax] // Y + lea eax, [eax + 16] + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm2 // YUYV + punpckhbw xmm1, xmm2 + movdqa [edi], xmm0 + movdqa [edi + 16], xmm1 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToUYVYRow_SSE2(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_frame, int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u + mov edx, [esp + 8 + 12] // src_v + mov edi, [esp + 8 + 16] // dst_frame + mov ecx, [esp + 8 + 20] // width + sub edx, esi + + align 16 + convertloop: + movq xmm2, qword ptr [esi] // U + movq xmm3, qword ptr [esi + edx] // V + lea esi, [esi + 8] + punpcklbw xmm2, xmm3 // UV + movdqa xmm0, [eax] // Y + movdqa xmm1, xmm2 + lea eax, [eax + 16] + punpcklbw xmm1, xmm0 // UYVY + punpckhbw xmm2, xmm0 + movdqa [edi], xmm1 + movdqa [edi + 16], xmm2 + lea edi, [edi + 32] + sub ecx, 16 + jg convertloop + + pop edi + pop esi + ret + } +} #endif // _M_IX86 #ifdef __cplusplus