diff --git a/README.chromium b/README.chromium index 3462416ed..e4cd56b95 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1220 +Version: 1222 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a597a53b5..c0eb794d4 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -114,7 +114,8 @@ extern "C" { #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_X86 -#define HAS_ARGBSETROWS_X86 +#define HAS_SETROW_ERMS +#define HAS_ARGBSETROW_X86 #define HAS_SPLITUVROW_SSE2 #define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 @@ -302,7 +303,8 @@ extern "C" { #define HAS_RGB565TOYROW_NEON #define HAS_RGBATOUVROW_NEON #define HAS_RGBATOYROW_NEON -// #define HAS_SETROW_NEON +#define HAS_SETROW_NEON +#define HAS_ARGBSETROW_NEON #define HAS_SPLITUVROW_NEON #define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON @@ -332,7 +334,6 @@ extern "C" { #define HAS_SOBELXYROW_NEON #define HAS_SOBELYROW_NEON #define HAS_ARGBCOLORMATRIXROW_NEON -// #define HAS_ARGBSETROWS_NEON #define HAS_ARGBSHUFFLEROW_NEON #endif @@ -800,15 +801,17 @@ void ARGBCopyYToAlphaRow_C(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_SSE2(const uint8* src_y, uint8* dst_argb, int width); void ARGBCopyYToAlphaRow_AVX2(const uint8* src_y, uint8* dst_argb, int width); -void SetRow_C(uint8* dst, uint32 v32, int count); -void SetRow_X86(uint8* dst, uint32 v32, int count); -void SetRow_NEON(uint8* dst, uint32 v32, int count); -void ARGBSetRows_C(uint8* dst, uint32 v32, int width, int dst_stride, - int height); -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height); -void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width, - int dst_stride, int height); +void SetRow_C(uint8* dst, uint8 v8, int count); +void SetRow_X86(uint8* dst, uint8 v8, int count); +void SetRow_ERMS(uint8* dst, uint8 v8, int count); +void SetRow_NEON(uint8* dst, uint8 v8, int count); +void SetRow_Any_X86(uint8* dst, uint8 v8, int count); +void SetRow_Any_NEON(uint8* dst, uint8 v8, int count); + +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_NEON(uint8* dst_argb, uint32 v32, int count); +void ARGBSetRow_Any_NEON(uint8* dst_argb, uint32 v32, int count); // ARGBShufflers for BGRAToARGB etc. void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 8b59f5e17..39cff222c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1220 +#define LIBYUV_VERSION 1222 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index c17c8ad51..691b80e34 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1094,8 +1094,7 @@ void SetPlane(uint8* dst_y, int dst_stride_y, int width, int height, uint32 value) { int y; - uint32 v32 = value | (value << 8) | (value << 16) | (value << 24); - void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; + void (*SetRow)(uint8* dst, uint8 value, int pix) = SetRow_C; if (height < 0) { height = -height; dst_y = dst_y + (height - 1) * dst_stride_y; @@ -1108,19 +1107,30 @@ void SetPlane(uint8* dst_y, int dst_stride_y, dst_stride_y = 0; } #if defined(HAS_SETROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - SetRow = SetRow_NEON; + if (TestCpuFlag(kCpuHasNEON)) { + SetRow = SetRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + SetRow = SetRow_NEON; + } } #endif #if defined(HAS_SETROW_X86) - if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) { - SetRow = SetRow_X86; + if (TestCpuFlag(kCpuHasX86)) { + SetRow = SetRow_Any_X86; + if (IS_ALIGNED(width, 4)) { + SetRow = SetRow_X86; + } + } +#endif +#if defined(HAS_SETROW_ERMS) + if (TestCpuFlag(kCpuHasERMS)) { + SetRow = SetRow_ERMS; } #endif // Set plane for (y = 0; y < height; ++y) { - SetRow(dst_y, v32, width); + SetRow(dst_y, value, width); dst_y += dst_stride_y; } } @@ -1139,7 +1149,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y, uint8* start_u = dst_u + (y / 2) * dst_stride_u + (x / 2); uint8* start_v = dst_v + (y / 2) * dst_stride_v + (x / 2); if (!dst_y || !dst_u || !dst_v || - width <= 0 || height <= 0 || + width <= 0 || height == 0 || x < 0 || y < 0 || value_y < 0 || value_y > 255 || value_u < 0 || value_u > 255 || @@ -1159,6 +1169,8 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, int dst_x, int dst_y, int width, int height, uint32 value) { + int y; + void (*ARGBSetRow)(uint8* dst_argb, uint32 value, int pix) = ARGBSetRow_C; if (!dst_argb || width <= 0 || height == 0 || dst_x < 0 || dst_y < 0) { @@ -1176,19 +1188,26 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb, height = 1; dst_stride_argb = 0; } -#if defined(HAS_ARGBSETROWS_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - ARGBSetRows_NEON(dst_argb, value, width, dst_stride_argb, height); - return 0; + +#if defined(HAS_ARGBSETROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBSetRow = ARGBSetRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBSetRow = ARGBSetRow_NEON; + } } #endif -#if defined(HAS_ARGBSETROWS_X86) +#if defined(HAS_ARGBSETROW_X86) if (TestCpuFlag(kCpuHasX86)) { - ARGBSetRows_X86(dst_argb, value, width, dst_stride_argb, height); - return 0; + ARGBSetRow = ARGBSetRow_X86; } #endif - ARGBSetRows_C(dst_argb, value, width, dst_stride_argb, height); + + // Set plane + for (y = 0; y < height; ++y) { + ARGBSetRow(dst_argb, value, width); + dst_argb += dst_stride_argb; + } return 0; } diff --git a/source/row_any.cc b/source/row_any.cc index bf5455a51..398eb3864 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -681,6 +681,27 @@ MANY(CopyRow_Any_NEON, CopyRow_NEON, CopyRow_C, 1, 31) #endif #undef MANY +#define SETANY(NAMEANY, SET_SIMD, SET_C, T, BPP, MASK) \ + void NAMEANY(uint8* dst_y, T v8, int width) { \ + int n = width & ~MASK; \ + int r = width & MASK; \ + if (n > 0) { \ + SET_SIMD(dst_y, v8, n); \ + } \ + SET_C(dst_y + n * BPP, v8, r); \ + } + +#ifdef HAS_SETROW_X86 +SETANY(SetRow_Any_X86, SetRow_X86, SetRow_ERMS, uint8, 1, 3) +#endif +#ifdef HAS_SETROW_NEON +SETANY(SetRow_Any_NEON, SetRow_NEON, SetRow_C, uint8, 1, 15) +#endif +#ifdef HAS_ARGBSETROW_NEON +SETANY(ARGBSetRow_Any_NEON, ARGBSetRow_NEON, ARGBSetRow_C, uint32, 4, 3) +#endif +#undef SETANY + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/row_common.cc b/source/row_common.cc index d29dc7675..eb6b75119 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1623,28 +1623,15 @@ void CopyRow_16_C(const uint16* src, uint16* dst, int count) { memcpy(dst, src, count * 2); } -void SetRow_C(uint8* dst, uint32 v8, int count) { -#ifdef _MSC_VER - // VisualC will generate rep stosb. - int x; - for (x = 0; x < count; ++x) { - dst[x] = v8; - } -#else - memset(dst, v8, count); -#endif +void SetRow_C(uint8* dst, uint8 v8, int width) { + memset(dst, v8, width); } -void ARGBSetRows_C(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - int y; - for (y = 0; y < height; ++y) { - uint32* d = (uint32*)(dst); - int x; - for (x = 0; x < width; ++x) { - d[x] = v32; - } - dst += dst_stride; +void ARGBSetRow_C(uint8* dst_argb, uint32 v32, int width) { + uint32* d = (uint32*)(dst_argb); + int x; + for (x = 0; x < width; ++x) { + d[x] = v32; } } diff --git a/source/row_neon.cc b/source/row_neon.cc index 65355f327..ff2463788 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -846,7 +846,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { } // SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint32 v8, int count) { +void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( "vdup.8 q0, %2 \n" // duplicate 16 bytes "1: \n" diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 7914aadee..82caeb9fa 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -736,7 +736,7 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { #endif // HAS_COPYROW_NEON // SetRow writes 'count' bytes using an 8 bit value repeated. -void SetRow_NEON(uint8* dst, uint32 v8, int count) { +void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" diff --git a/source/row_posix.cc b/source/row_posix.cc index be070188a..45a2a9fa8 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -2893,10 +2893,10 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -void SetRow_X86(uint8* dst, uint32 v32, int width) { - size_t width_tmp = (size_t)(width); +void SetRow_X86(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width >> 2); + const uint32 v32 = v8 * 0x01010101; // Duplicate byte to all bytes. asm volatile ( - "shr $0x2,%1 \n" "rep stosl " MEMSTORESTRING(eax,0) " \n" : "+D"(dst), // %0 "+c"(width_tmp) // %1 @@ -2904,19 +2904,24 @@ void SetRow_X86(uint8* dst, uint32 v32, int width) { : "memory", "cc"); } -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { - for (int y = 0; y < height; ++y) { - size_t width_tmp = (size_t)(width); - uint32* d = (uint32*)(dst); - asm volatile ( - "rep stosl " MEMSTORESTRING(eax,0) " \n" - : "+D"(d), // %0 - "+c"(width_tmp) // %1 - : "a"(v32) // %2 - : "memory", "cc"); - dst += dst_stride; - } +void SetRow_ERMS(uint8* dst, uint8 v8, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosb " MEMSTORESTRING(al,0) " \n" + : "+D"(dst), // %0 + "+c"(width_tmp) // %1 + : "a"(v8) // %2 + : "memory", "cc"); +} + +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) { + size_t width_tmp = (size_t)(width); + asm volatile ( + "rep stosl " MEMSTORESTRING(eax,0) " \n" + : "+D"(dst_argb), // %0 + "+c"(width_tmp) // %1 + : "a"(v32) // %2 + : "memory", "cc"); } #endif // HAS_SETROW_X86 diff --git a/source/row_win.cc b/source/row_win.cc index f36f84eba..822ad99ce 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2848,13 +2848,16 @@ void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) { #endif // HAS_ARGBCOPYYTOALPHAROW_AVX2 #ifdef HAS_SETROW_X86 -// SetRow writes 'count' bytes using a 32 bit value repeated. +// Write 'count' bytes using an 8 bit value repeated. +// Count should be multiple of 4. __declspec(naked) __declspec(align(16)) -void SetRow_X86(uint8* dst, uint32 v32, int count) { +void SetRow_X86(uint8* dst, uint8 v8, int count) { __asm { + movzx eax, byte ptr [esp + 8] // v8 + mov edx, 0x01010101 // Duplicate byte to all bytes. + mul edx // overwrites edx with upper part of result. mov edx, edi mov edi, [esp + 4] // dst - mov eax, [esp + 8] // v32 mov ecx, [esp + 12] // count shr ecx, 2 rep stosd @@ -2863,32 +2866,30 @@ void SetRow_X86(uint8* dst, uint32 v32, int count) { } } -// SetRow32 writes 'count' words using a 32 bit value repeated. +// Write 'count' bytes using an 8 bit value repeated. __declspec(naked) __declspec(align(16)) -void ARGBSetRows_X86(uint8* dst, uint32 v32, int width, - int dst_stride, int height) { +void SetRow_ERMS(uint8* dst, uint8 v8, int count) { __asm { - push esi - push edi - push ebp - mov edi, [esp + 12 + 4] // dst - mov eax, [esp + 12 + 8] // v32 - mov ebp, [esp + 12 + 12] // width - mov edx, [esp + 12 + 16] // dst_stride - mov esi, [esp + 12 + 20] // height - lea ecx, [ebp * 4] - sub edx, ecx // stride - width * 4 + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v8 + mov ecx, [esp + 12] // count + rep stosb + mov edi, edx + ret + } +} - convertloop: - mov ecx, ebp +// Write 'count' 32 bit values. +__declspec(naked) __declspec(align(16)) +void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { + __asm { + mov edx, edi + mov edi, [esp + 4] // dst + mov eax, [esp + 8] // v32 + mov ecx, [esp + 12] // count rep stosd - add edi, edx - sub esi, 1 - jg convertloop - - pop ebp - pop edi - pop esi + mov edi, edx ret } }