diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 0e164a5e7..665c5a723 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -12,6 +12,8 @@ #define INCLUDE_LIBYUV_CONVERT_H_ #include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following header includes +#include "libyuv/convert_from.h" #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index e1380fa27..214235a01 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -13,6 +13,10 @@ #include "libyuv/basic_types.h" +// TODO(fbarchard): Remove the following header includes +#include "libyuv/convert.h" +#include "libyuv/planar_functions.h" + #ifdef __cplusplus namespace libyuv { extern "C" { diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e0f4ec43b..1566171f3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 174 +#define LIBYUV_VERSION 175 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index d7638b324..7d682083d 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -10,8 +10,6 @@ #include "libyuv/convert.h" -#include <string.h> // For memcpy() - #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/format_conversion.h" @@ -283,14 +281,34 @@ int I400ToI420(const uint8* src_y, int src_stride_y, static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, uint8* dst, int dst_stride_frame, int width, int height) { + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; + } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if
(TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 32) && IS_ALIGNED(src, 16) && + IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) && + IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_frame, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif + // Copy plane - for (int y = 0; y < height; y += 2) { - memcpy(dst, src, width); - src += src_stride_0; - dst += dst_stride_frame; - memcpy(dst, src, width); - src += src_stride_1; - dst += dst_stride_frame; + for (int y = 0; y < height - 1; y += 2) { + CopyRow(src, dst, width); + CopyRow(src + src_stride_0, dst + dst_stride_frame, width); + src += src_stride_0 + src_stride_1; + dst += dst_stride_frame * 2; + } + if (height & 1) { + CopyRow(src, dst, width); } } @@ -514,6 +532,24 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; + } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif + void (*SplitYUY2)(const uint8* src_yuy2, uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix); #if defined(HAS_SPLITYUY2_SSE2) @@ -528,7 +564,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, SplitYUY2 = SplitYUY2_C; } for (int y = 0; y < height; y += 2) { - memcpy(dst_y, src_y, width); + CopyRow(src_y, dst_y, width); dst_y += dst_stride_y; src_y += src_stride_y; diff --git a/source/convert_from.cc b/source/convert_from.cc index d11b6aa21..cb436a1e4 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -10,8 +10,6 @@ #include "libyuv/convert_from.h" -#include // For 
memcpy() - #include "libyuv/basic_types.h" #include "libyuv/convert.h" // For I420Copy #include "libyuv/cpu_id.h" @@ -43,33 +41,53 @@ int I420ToI422(const uint8* src_y, int src_stride_y, dst_stride_u = -dst_stride_u; dst_stride_v = -dst_stride_v; } + int halfwidth = (width + 1) >> 1; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 64)) { + CopyRow = CopyRow_NEON; + } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(halfwidth, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(halfwidth, 32) && + IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) && + IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && + IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && + IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif + // Copy Y plane if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } - int halfwidth = (width + 1) >> 1; // UpSample U plane. int y; for (y = 0; y < height - 1; y += 2) { - memcpy(dst_u, src_u, halfwidth); - memcpy(dst_u + dst_stride_u, src_u, halfwidth); + CopyRow(src_u, dst_u, halfwidth); + CopyRow(src_u, dst_u + dst_stride_u, halfwidth); src_u += src_stride_u; dst_u += dst_stride_u * 2; } if (height & 1) { - memcpy(dst_u, src_u, halfwidth); + CopyRow(src_u, dst_u, halfwidth); } // UpSample V plane. 
for (y = 0; y < height - 1; y += 2) { - memcpy(dst_v, src_v, halfwidth); - memcpy(dst_v + dst_stride_v, src_v, halfwidth); + CopyRow(src_v, dst_v, halfwidth); + CopyRow(src_v, dst_v + dst_stride_v, halfwidth); src_v += src_stride_v; dst_v += dst_stride_v * 2; } if (height & 1) { - memcpy(dst_v, src_v, halfwidth); + CopyRow(src_v, dst_v, halfwidth); } return 0; } diff --git a/source/convertfrom.cc b/source/convertfrom.cc new file mode 100644 index 000000000..d68703627 --- /dev/null +++ b/source/convertfrom.cc @@ -0,0 +1,3 @@ +// TODO(fbarchard): Remove once builds have switched to convert_from +#include "convert_from.cc" + diff --git a/source/planar_functions.cc b/source/planar_functions.cc index a48e96eaa..2691db7ea 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -10,8 +10,6 @@ #include "libyuv/planar_functions.h" -#include // For memcpy() - #include "libyuv/cpu_id.h" #include "row.h" @@ -20,110 +18,28 @@ namespace libyuv { extern "C" { #endif -// CopyRows copys 'count' bytes using a 16 byte load/store, 64 bytes at time -#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM) -#define HAS_COPYROW_SSE2 -__declspec(naked) -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { - __asm { - mov eax, [esp + 4] // src - mov edx, [esp + 8] // dst - mov ecx, [esp + 12] // count - sub edx, eax - convertloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - movdqa [eax + edx], xmm0 - movdqa [eax + edx + 16], xmm1 - lea eax, [eax + 32] - sub ecx, 32 - ja convertloop - ret - } -} - -#define HAS_COPYROW_X86 -__declspec(naked) -void CopyRow_X86(const uint8* src, uint8* dst, int count) { - __asm { - mov eax, esi - mov edx, edi - mov esi, [esp + 4] // src - mov edi, [esp + 8] // dst - mov ecx, [esp + 12] // count - shr ecx, 2 - rep movsd - mov edi, edx - mov esi, eax - ret - } -} -#elif (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) -#define HAS_COPYROW_SSE2 -void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { - asm 
volatile ( - "sub %0,%1 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa %%xmm0,(%0,%1) \n" - "movdqa %%xmm1,0x10(%0,%1) \n" - "lea 0x20(%0),%0 \n" - "sub $0x20,%2 \n" - "ja 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(count) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1" -#endif - ); -} - -#define HAS_COPYROW_X86 -void CopyRow_X86(const uint8* src, uint8* dst, int width) { - size_t width_tmp = static_cast<size_t>(width); - asm volatile ( - "shr $0x2,%2 \n" - "rep movsl \n" - : "+S"(src), // %0 - "+D"(dst), // %1 - "+c"(width_tmp) // %2 - : - : "memory", "cc" - ); -} -#endif - -void CopyRow_C(const uint8* src, uint8* dst, int count) { - memcpy(dst, src, count); -} - // Copy a plane of data void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { - void (*CopyRow)(const uint8* src, uint8* dst, int width); -#if defined(HAS_COPYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 32) && - IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && - IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - CopyRow = CopyRow_SSE2; - } else -#endif -#if defined(HAS_COPYROW_X86) - if (IS_ALIGNED(width, 4) && - IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && - IS_ALIGNED(dst_y, 4) && IS_ALIGNED(dst_stride_y, 4)) { - CopyRow = CopyRow_X86; - } else -#endif - { - CopyRow = CopyRow_C; + void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; +#if defined(HAS_COPYROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 64)) { + CopyRow = CopyRow_NEON; } +#elif defined(HAS_COPYROW_X86) + if (IS_ALIGNED(width, 4)) { + CopyRow = CopyRow_X86; +#if defined(HAS_COPYROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 32) && + IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) && + IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + CopyRow = CopyRow_SSE2; + } +#endif + } +#endif // Copy plane for (int y = 0; y < height;
++y) { diff --git a/source/row.h b/source/row.h index c8670a893..c45127d05 100644 --- a/source/row.h +++ b/source/row.h @@ -47,6 +47,8 @@ extern "C" { #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSE2 #define HAS_SPLITUV_SSE2 +#define HAS_COPYROW_SSE2 +#define HAS_COPYROW_X86 #define HAS_YUY2TOYROW_SSE2 #define HAS_UYVYTOYROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 @@ -69,6 +71,7 @@ extern "C" { #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_NEON #define HAS_SPLITUV_NEON +#define HAS_COPYROW_NEON #define HAS_I420TOARGBROW_NEON #define HAS_I420TOBGRAROW_NEON #define HAS_I420TOABGRROW_NEON @@ -131,6 +134,11 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); +void CopyRow_SSE2(const uint8* src, uint8* dst, int count); +void CopyRow_X86(const uint8* src, uint8* dst, int count); +void CopyRow_NEON(const uint8* src, uint8* dst, int count); +void CopyRow_C(const uint8* src, uint8* dst, int count); + void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index ec22b37fe..23352c8b1 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -377,6 +377,10 @@ void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { } } +void CopyRow_C(const uint8* src, uint8* dst, int count) { + memcpy(dst, src, count); +} + // Filter 2 rows of YUY2 UV's (422) into U and V (420) void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2, uint8* dst_u, uint8* dst_v, int pix) { @@ -423,36 +427,36 @@ void UYVYToYRow_C(const uint8* src_uyvy, } // Wrappers to handle odd sizes/alignments -#define MAKEYUVANY(NAMEANY, NAME) \ -void NAMEANY(const uint8* y_buf, \ - const uint8* 
u_buf, \ - const uint8* v_buf, \ - uint8* rgb_buf, \ - int width) { \ - SIMD_ALIGNED(uint8 row[kMaxStride]); \ - NAME(y_buf, u_buf, v_buf, row, width); \ - memcpy(rgb_buf, row, width << 2); \ -} +#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* u_buf, \ + const uint8* v_buf, \ + uint8* rgb_buf, \ + int width) { \ + SIMD_ALIGNED(uint8 row[kMaxStride]); \ + NAME(y_buf, u_buf, v_buf, row, width); \ + COPYROW(row, rgb_buf, width << 2); \ + } #if defined(HAS_I420TOARGBROW_SSSE3) -MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3) -MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3) -MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3) +MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3, CopyRow_X86) +MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3, CopyRow_X86) +MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3, CopyRow_X86) #endif #if defined(HAS_I420TOARGBROW_NEON) -MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON) -MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON) -MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON) +MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, CopyRow_C) +MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, CopyRow_C) +MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C) #endif #define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \ -void NAMEANY(const uint8* argb_buf, \ - uint8* rgb_buf, \ - int width) { \ - SIMD_ALIGNED(uint8 row[kMaxStride]); \ - ARGBTORGB(argb_buf, row, width); \ - memcpy(rgb_buf, row, width * BPP); \ -} + void NAMEANY(const uint8* argb_buf, \ + uint8* rgb_buf, \ + int width) { \ + SIMD_ALIGNED(uint8 row[kMaxStride]); \ + ARGBTORGB(argb_buf, row, width); \ + memcpy(rgb_buf, row, width * BPP); \ + } #if defined(HAS_ARGBTORGB24ROW_SSSE3) MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) diff --git a/source/row_neon.cc b/source/row_neon.cc index 68028b192..00bbf729d 100644 --- a/source/row_neon.cc +++ 
b/source/row_neon.cc @@ -184,7 +184,27 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { } #endif -#endif // __ARM_NEON__ +#if defined(HAS_COPYROW_NEON) +// TODO(fbarchard): Test with and without pld +// "pld [%0, #0xC0] \n" // preload +// Copy multiple of 64 +void CopyRow_NEON(const uint8* src, uint8* dst, int count) { + asm volatile ( + "1: \n" + "vld1.u8 {q0,q1,q2,q3}, [%0]! \n" // load 64 + "subs %2, %2, #64 \n" // 64 processed per loop + "vst1.u8 {q0,q1,q2,q3}, [%1]! \n" // store 64 + "bhi 1b \n" + : "+r"(src), + "+r"(dst), + "+r"(count) // Output registers + : // Input registers + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_COPYROW_NEON + +#endif // __ARM_NEON__ #ifdef __cplusplus } // extern "C" diff --git a/source/row_posix.cc b/source/row_posix.cc index 6fb3f3c73..9a1770e5f 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -848,7 +848,6 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" "sub %1,%2 \n" - "1: \n" "movdqa (%0),%%xmm0 \n" "movdqa 0x10(%0),%%xmm1 \n" @@ -879,6 +878,45 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { } #endif +#ifdef HAS_COPYROW_SSE2 +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + asm volatile ( + "sub %0,%1 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa %%xmm0,(%0,%1) \n" + "movdqa %%xmm1,0x10(%0,%1) \n" + "lea 0x20(%0),%0 \n" + "sub $0x20,%2 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(count) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +void CopyRow_X86(const uint8* src, uint8* dst, int width) { + size_t width_tmp = static_cast<size_t>(width); + asm volatile ( + "shr $0x2,%2 \n" + "rep movsl \n" + : "+S"(src), // %0 + "+D"(dst), // %1 + "+c"(width_tmp) // %2 + : + : "memory", "cc" + );
+} +#endif + #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( diff --git a/source/row_win.cc b/source/row_win.cc index 7d347c436..31224076f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1569,6 +1569,46 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { } #endif +#ifdef HAS_COPYROW_SSE2 +// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time +__declspec(naked) +void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, [esp + 4] // src + mov edx, [esp + 8] // dst + mov ecx, [esp + 12] // count + sub edx, eax + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa [eax + edx], xmm0 + movdqa [eax + edx + 16], xmm1 + lea eax, [eax + 32] + sub ecx, 32 + ja convertloop + ret + } +} +#endif // HAS_COPYROW_SSE2 + +#ifdef HAS_COPYROW_X86 +__declspec(naked) +void CopyRow_X86(const uint8* src, uint8* dst, int count) { + __asm { + mov eax, esi + mov edx, edi + mov esi, [esp + 4] // src + mov edi, [esp + 8] // dst + mov ecx, [esp + 12] // count + shr ecx, 2 + rep movsd + mov edi, edx + mov esi, eax + ret + } +} +#endif + #ifdef HAS_YUY2TOYROW_SSE2 __declspec(naked) void YUY2ToYRow_SSE2(const uint8* src_yuy2,