diff --git a/README.chromium b/README.chromium
index de225cf38..b769c3752 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 508
+Version: 509
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 975497176..58fa8bb2a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -223,6 +223,7 @@ extern "C" {
 #define HAS_ARGBBLENDROW_NEON
 #define HAS_ARGBATTENUATEROW_NEON
 #define HAS_ARGBQUANTIZEROW_NEON
+#define HAS_ARGBSHADEROW_NEON
 #endif
 
 // The following are available on Mips platforms
@@ -1250,6 +1251,8 @@ void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
                     uint32 value);
 void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                        uint32 value);
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value);
 
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 518d60c3d..08ff1217f 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 508
+#define LIBYUV_VERSION 509
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 60d61c150..adb6b2b27 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1133,6 +1133,10 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     ARGBShadeRow = ARGBShadeRow_SSE2;
   }
+#elif defined(HAS_ARGBSHADEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+    ARGBShadeRow = ARGBShadeRow_NEON;
+  }
 #endif
 
   for (int y = 0; y < height; ++y) {
diff --git a/source/row_common.cc b/source/row_common.cc
index 6bd9945b5..e5edb36f6 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -665,6 +665,32 @@ void ARGBQuantizeRow_C(uint8* dst_argb, int scale, int interval_size,
   }
 }
 
+#define REPEAT8(v) (v) | ((v) << 8)
+#define SHADE(f, v) v * f >> 24
+
+void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
+                    uint32 value) {
+  const uint32 b_scale = REPEAT8(value & 0xff);
+  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
+  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
+  const uint32 a_scale = REPEAT8(value >> 24);
+
+  for (int i = 0; i < width; ++i) {
+    const uint32 b = REPEAT8(src_argb[0]);
+    const uint32 g = REPEAT8(src_argb[1]);
+    const uint32 r = REPEAT8(src_argb[2]);
+    const uint32 a = REPEAT8(src_argb[3]);
+    dst_argb[0] = SHADE(b, b_scale);
+    dst_argb[1] = SHADE(g, g_scale);
+    dst_argb[2] = SHADE(r, r_scale);
+    dst_argb[3] = SHADE(a, a_scale);
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+#undef REPEAT8
+#undef SHADE
+
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
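Note on the fixed-point math in ARGBShadeRow_C above: REPEAT8 widens an
8-bit value x into the 16-bit value x * 0x101 (257), so SHADE computes
(x * 257) * (v * 257) >> 24, a close approximation of x * v / 255. Both
macro arguments are always plain identifiers here, so the unparenthesized
macro bodies are safe. The standalone sketch below models the same
arithmetic on one channel; the helper names Repeat8 and Shade are
illustrative only, not libyuv APIs.

#include <stdint.h>
#include <stdio.h>

// Model of REPEAT8: duplicate a byte into both halves of a 16-bit value.
static uint32_t Repeat8(uint32_t v) { return v | (v << 8); }
// Model of SHADE: 16x16-bit product, keep bits 24..31.
static uint32_t Shade(uint32_t f, uint32_t v) { return (v * f) >> 24; }

int main(void) {
  // One pixel from the unit test below: blue = 10, shaded by 0x80 (~0.5).
  uint32_t b = 10, scale = 0x80;
  printf("%u\n", (unsigned)Shade(Repeat8(b), Repeat8(scale)));  // prints 5
  return 0;
}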
  for (int x = 0; x < width; ++x) {
@@ -1512,32 +1538,6 @@ void CumulativeSumToAverageRow_C(const int32* tl, const int32* bl,
   }
 }
 
-#define REPEAT8(v) (v) | ((v) << 8)
-#define SHADE(f, v) v * f >> 24
-
-void ARGBShadeRow_C(const uint8* src_argb, uint8* dst_argb, int width,
-                    uint32 value) {
-  const uint32 b_scale = REPEAT8(value & 0xff);
-  const uint32 g_scale = REPEAT8((value >> 8) & 0xff);
-  const uint32 r_scale = REPEAT8((value >> 16) & 0xff);
-  const uint32 a_scale = REPEAT8(value >> 24);
-
-  for (int i = 0; i < width; ++i) {
-    const uint32 b = REPEAT8(src_argb[0]);
-    const uint32 g = REPEAT8(src_argb[1]);
-    const uint32 r = REPEAT8(src_argb[2]);
-    const uint32 a = REPEAT8(src_argb[3]);
-    dst_argb[0] = SHADE(b, b_scale);
-    dst_argb[1] = SHADE(g, g_scale);
-    dst_argb[2] = SHADE(r, r_scale);
-    dst_argb[3] = SHADE(a, a_scale);
-    src_argb += 4;
-    dst_argb += 4;
-  }
-}
-#undef REPEAT8
-#undef SHADE
-
 // Copy pixels from rotated source to destination row with a slope.
 LIBYUV_API
 void ARGBAffineRow_C(const uint8* src_argb, int src_argb_stride,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index c9f9f5a45..669d5637c 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2510,7 +2510,43 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
   : "r"(scale),           // %2
     "r"(interval_size),   // %3
     "r"(interval_offset)  // %4
-  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
   );
 }
 
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scalar register from d0 to d7.
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  asm volatile (
+    "vdup.u32   q0, %3                         \n"  // duplicate scale value.
+    "vtrn.u8    d0, d1                         \n"  // d0 rrbb, d1 aagg
+    "vshr.u16   q0, q0, #1                     \n"  // scale >>= 1 (vqrdmulh doubles).
+
+    // 8 pixel loop.
+    ".p2align  2                               \n"
+    "1:                                        \n"
+    "vld4.8     {d20, d22, d24, d26}, [%0]!    \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmovl.u8   q10, d20                       \n"  // b (0 .. 255)
+    "vmovl.u8   q11, d22                       \n"
+    "vmovl.u8   q12, d24                       \n"
+    "vmovl.u8   q13, d26                       \n"
+    "vqrdmulh.s16 q10, q10, d0[0]              \n"  // b * scale
+    "vqrdmulh.s16 q11, q11, d1[0]              \n"  // g
+    "vqrdmulh.s16 q12, q12, d0[1]              \n"  // r
+    "vqrdmulh.s16 q13, q13, d1[1]              \n"  // a
+    "vqmovn.u16 d20, q10                       \n"
+    "vqmovn.u16 d22, q11                       \n"
+    "vqmovn.u16 d24, q12                       \n"
+    "vqmovn.u16 d26, q13                       \n"
+    "vst4.8     {d20, d22, d24, d26}, [%1]!    \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  : "r"(value)       // %3
+  : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+  );
+}
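Note on the NEON path: the s16 by-scalar form of vqrdmulh can only encode
d0 through d7 as the scalar register (hence the NOTE above), and the
instruction computes a doubling, rounding high-half multiply,
(2 * a * b + 0x8000) >> 16, with saturation. The repeated-byte scale is
therefore pre-shifted right by one so the doubling cancels. Per the vtrn
comment, the 16-bit scale lanes land as b -> d0[0], g -> d1[0], r -> d0[1],
a -> d1[1]. A standalone C model of one lane follows; Vqrdmulh is an
illustrative helper name, not a real intrinsic.

#include <stdint.h>
#include <stdio.h>

// Model of vqrdmulh.s16: saturating, rounding, doubling high-half multiply.
static int16_t Vqrdmulh(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * b + 0x8000;  // doubling + round-to-nearest
  if (p > 0x7fffffff) p = 0x7fffffff;       // saturate (-32768 * -32768 only)
  return (int16_t)(p >> 16);
}

int main(void) {
  uint8_t b = 10;                           // blue channel, as after vmovl.u8
  uint16_t scale = 0x80 | (0x80 << 8);      // REPEAT8(0x80)
  // Prints 5 for this input, matching the C reference path.
  printf("%d\n", Vqrdmulh(b, (int16_t)(scale >> 1)));
  return 0;
}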
+ ".p2align 2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "pmulhuw %%xmm2,%%xmm0 \n" + "pmulhuw %%xmm2,%%xmm1 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%0,%1,1) \n" + "lea 0x10(%0),%0 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(value) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2" +#endif + ); +} +#endif // HAS_ARGBSHADEROW_SSE2 + #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 // Creates a table of cumulative sums where each value is a sum of all values // above and to the left of the value, inclusive of the value. @@ -4091,44 +4130,6 @@ void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft, ); } #endif // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -// Aligned to 16 bytes. -void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, - uint32 value) { - asm volatile ( - "movd %3,%%xmm2 \n" - "sub %0,%1 \n" - "punpcklbw %%xmm2,%%xmm2 \n" - "punpcklqdq %%xmm2,%%xmm2 \n" - - // 4 pixel loop. - ".p2align 2 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm0,%%xmm0 \n" - "punpckhbw %%xmm1,%%xmm1 \n" - "pmulhuw %%xmm2,%%xmm0 \n" - "pmulhuw %%xmm2,%%xmm1 \n" - "psrlw $0x8,%%xmm0 \n" - "psrlw $0x8,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%0,%1,1) \n" - "lea 0x10(%0),%0 \n" - "jg 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "r"(value) // %3 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2" -#endif - ); -} -#endif // HAS_ARGBSHADEROW_SSE2 #ifdef HAS_ARGBAFFINEROW_SSE2 // TODO(fbarchard): Find 64 bit way to avoid masking. diff --git a/source/row_win.cc b/source/row_win.cc index b0d8a1117..360918b38 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4124,6 +4124,42 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, } #endif // HAS_ARGBQUANTIZEROW_SSE2 +#ifdef HAS_ARGBSHADEROW_SSE2 +// Shade 4 pixels at a time by specified value. +// Aligned to 16 bytes. +__declspec(naked) __declspec(align(16)) +void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, + uint32 value) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + movd xmm2, [esp + 16] // value + sub edx, eax + punpcklbw xmm2, xmm2 + punpcklqdq xmm2, xmm2 + + align 16 + convertloop: + movdqa xmm0, [eax] // read 4 pixels + movdqa xmm1, xmm0 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value + psrlw xmm0, 8 + psrlw xmm1, 8 + packuswb xmm0, xmm1 + sub ecx, 4 + movdqa [eax + edx], xmm0 + lea eax, [eax + 16] + jg convertloop + + ret + } +} +#endif // HAS_ARGBSHADEROW_SSE2 + #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2 // Consider float CumulativeSum. // Consider calling CumulativeSum one row at time as needed. @@ -4315,42 +4351,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, } #endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 -#ifdef HAS_ARGBSHADEROW_SSE2 -// Shade 4 pixels at a time by specified value. -// Aligned to 16 bytes. 
diff --git a/source/row_win.cc b/source/row_win.cc
index b0d8a1117..360918b38 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4124,6 +4124,42 @@ void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
 }
 #endif  // HAS_ARGBQUANTIZEROW_SSE2
 
+#ifdef HAS_ARGBSHADEROW_SSE2
+// Shade 4 pixels at a time by specified value.
+// Aligned to 16 bytes.
+__declspec(naked) __declspec(align(16))
+void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
+                       uint32 value) {
+  __asm {
+    mov        eax, [esp + 4]   // src_argb
+    mov        edx, [esp + 8]   // dst_argb
+    mov        ecx, [esp + 12]  // width
+    movd       xmm2, [esp + 16]  // value
+    sub        edx, eax
+    punpcklbw  xmm2, xmm2
+    punpcklqdq xmm2, xmm2
+
+    align      16
+  convertloop:
+    movdqa     xmm0, [eax]      // read 4 pixels
+    movdqa     xmm1, xmm0
+    punpcklbw  xmm0, xmm0       // first 2
+    punpckhbw  xmm1, xmm1       // next 2
+    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm1, xmm2       // argb * value
+    psrlw      xmm0, 8
+    psrlw      xmm1, 8
+    packuswb   xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [eax + edx], xmm0
+    lea        eax, [eax + 16]
+    jg         convertloop
+
+    ret
+  }
+}
+#endif  // HAS_ARGBSHADEROW_SSE2
+
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 // Consider float CumulativeSum.
 // Consider calling CumulativeSum one row at time as needed.
@@ -4315,42 +4351,6 @@ void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
 }
 #endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
 
-#ifdef HAS_ARGBSHADEROW_SSE2
-// Shade 4 pixels at a time by specified value.
-// Aligned to 16 bytes.
-__declspec(naked) __declspec(align(16))
-void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
-                       uint32 value) {
-  __asm {
-    mov        eax, [esp + 4]   // src_argb
-    mov        edx, [esp + 8]   // dst_argb
-    mov        ecx, [esp + 12]  // width
-    movd       xmm2, [esp + 16]  // value
-    sub        edx, eax
-    punpcklbw  xmm2, xmm2
-    punpcklqdq xmm2, xmm2
-
-    align      16
-  convertloop:
-    movdqa     xmm0, [eax]      // read 4 pixels
-    movdqa     xmm1, xmm0
-    punpcklbw  xmm0, xmm0       // first 2
-    punpckhbw  xmm1, xmm1       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
-    pmulhuw    xmm1, xmm2       // argb * value
-    psrlw      xmm0, 8
-    psrlw      xmm1, 8
-    packuswb   xmm0, xmm1
-    sub        ecx, 4
-    movdqa     [eax + edx], xmm0
-    lea        eax, [eax + 16]
-    jg         convertloop
-
-    ret
-  }
-}
-#endif  // HAS_ARGBSHADEROW_SSE2
-
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // Copy ARGB pixels from source image with slope to a row of destination.
 __declspec(naked) __declspec(align(16))
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 169d96317..325fdcba1 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -539,7 +539,8 @@ TEST_F(libyuvTest, TestShade) {
   orig_pixels[3][1] = 0u;
   orig_pixels[3][2] = 0u;
   orig_pixels[3][3] = 0u;
-  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80ffffff);
+  // Do 8 pixels to allow opt version to be used.
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80ffffff);
   EXPECT_EQ(10u, shade_pixels[0][0]);
   EXPECT_EQ(20u, shade_pixels[0][1]);
   EXPECT_EQ(40u, shade_pixels[0][2]);
@@ -557,12 +558,18 @@ TEST_F(libyuvTest, TestShade) {
   EXPECT_EQ(0u, shade_pixels[3][2]);
   EXPECT_EQ(0u, shade_pixels[3][3]);
 
-  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 4, 1, 0x80808080);
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x80808080);
   EXPECT_EQ(5u, shade_pixels[0][0]);
   EXPECT_EQ(10u, shade_pixels[0][1]);
   EXPECT_EQ(20u, shade_pixels[0][2]);
   EXPECT_EQ(40u, shade_pixels[0][3]);
 
+  ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 8, 1, 0x10204080);
+  EXPECT_EQ(5u, shade_pixels[0][0]);
+  EXPECT_EQ(5u, shade_pixels[0][1]);
+  EXPECT_EQ(5u, shade_pixels[0][2]);
+  EXPECT_EQ(5u, shade_pixels[0][3]);
+
   for (int i = 0; i < benchmark_pixels_div256_; ++i) {
     ARGBShade(&orig_pixels[0][0], 0, &shade_pixels[0][0], 0, 256, 1,
               0x80808080);
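Sanity check for the new 0x10204080 expectations: the first test pixel is
b,g,r,a = 10,20,40,80 (inferred from the existing expectations earlier in
the test), and the per-channel scales 0x80,0x40,0x20,0x10 give a product of
exactly 1280 for every channel, so all four channels shade to 5. The
standalone sketch below reproduces the arithmetic; it is illustrative only
and not part of the patch.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t pixel[4] = {10, 20, 40, 80};          // b, g, r, a of pixel 0
  const uint8_t scale[4] = {0x80, 0x40, 0x20, 0x10};  // from value 0x10204080
  for (int i = 0; i < 4; ++i) {
    uint32_t p = pixel[i] * 0x101u;   // REPEAT8(pixel)
    uint32_t s = scale[i] * 0x101u;   // REPEAT8(scale)
    printf("%u ", (unsigned)(p * s >> 24));  // prints: 5 5 5 5
  }
  printf("\n");
  return 0;
}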