Add a NEON ARGBToYRow for ARM and bump libyuv from 421 to 423.

- row.h: define HAS_ARGBTOYROW_NEON and declare ARGBToYRow_NEON and
  ARGBToYRow_Any_NEON.
- convert.cc: ARGBToI420 selects the NEON row functions, under an #elif of
  the preceding preprocessor branch, when the CPU reports NEON.
- row_common.cc: YANY takes the SIMD step as a NUM parameter (16 for the
  existing users, 8 for ARGBToYRow_NEON); its comment is updated to match.
- row_neon.cc: implement ARGBToYRow_NEON.

diff --git a/README.chromium b/README.chromium
index fc9928645..798267777 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 421
+Version: 423
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index aa7c2193a..80a3da86d 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -153,6 +153,7 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
+#define HAS_ARGBTOYROW_NEON
 #endif
 
 // The following are available on Mips platforms
@@ -259,6 +260,8 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width);
 
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
@@ -613,6 +616,8 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb,
                            uint8* dst_u, uint8* dst_v, int width);
+void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix);
+
 void I422ToARGBRow_Any_NEON(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c7c9519e0..ef5f7779e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 421
+#define LIBYUV_VERSION 423
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index 5ab3609ea..359c62569 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -909,6 +909,15 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
       }
     }
   }
+#elif defined(HAS_ARGBTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      ARGBToYRow = ARGBToYRow_Any_NEON;
+    }
+    if (IS_ALIGNED(width, 8)) {
+      ARGBToYRow = ARGBToYRow_NEON;
+    }
+  }
 #endif
 
   for (int y = 0; y < height - 1; y += 2) {
diff --git a/source/row_common.cc b/source/row_common.cc
index 9bd5dfe73..cd23f59a9 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1083,27 +1083,30 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
 #undef RGBANY
 
-// RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD.
-#define YANY(NAMEANY, ARGBTOY_SIMD, BPP)                                       \
+// RGB/YUV to Y does multiple of NUM with SIMD and last NUM with SIMD.
+#define YANY(NAMEANY, ARGBTOY_SIMD, BPP, NUM)                                  \
     void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
-      ARGBTOY_SIMD(src_argb, dst_y, width - 16);                               \
-      ARGBTOY_SIMD(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16);   \
+      ARGBTOY_SIMD(src_argb, dst_y, width - NUM);                              \
+      ARGBTOY_SIMD(src_argb + (width - NUM) * BPP, dst_y + (width - NUM), NUM);\
     }
 
 #ifdef HAS_ARGBTOYROW_SSSE3
-YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4)
-YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4)
-YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4)
+YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 16)
+YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 16)
+YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 16)
 #endif
 #ifdef HAS_RGBATOYROW_SSSE3
-YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
+YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 16)
+#endif
+#ifdef HAS_ARGBTOYROW_NEON
+YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 8)
 #endif
 #ifdef HAS_YUY2TOYROW_SSE2
-YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
-YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
+YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 16)
+YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 16)
 #endif
 #ifdef HAS_YUY2TOYROW_NEON
-YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
-YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 16)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 16)
 #endif
 #undef YANY
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 5b37890a6..e86ce3138 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -995,6 +995,37 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
   );
 }
 #endif  // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+// Converts ARGB pixels to Y, 8 pixels per loop iteration.
+// The count is tested at the bottom of the loop, so a pix that is not a
+// multiple of 8 is rounded up; at least 8 pixels are always read and written.
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #13                        \n"  // B * 0.1016 coefficient
+    "vmov.u8    d5, #65                        \n"  // G * 0.5078 coefficient
+    "vmov.u8    d6, #33                        \n"  // R * 0.2578 coefficient
+    "vmov.u8    d7, #16                        \n"  // Add 16 constant
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q8, d0, d4                     \n"  // B
+    "vmlal.u8   q8, d1, d5                     \n"  // G
+    "vmlal.u8   q8, d2, d6                     \n"  // R
+    "vqrshrun.s16 d0, q8, #7                   \n"  // 16 bit to 8 bit Y
+    "vqadd.u8   d0, d7                         \n"  // saturating add of the 16
+    "vst1.8     {d0}, [%1]!                    \n"  // store 8 pixels Y.
+    "bgt        1b                             \n"
+    : "+r"(src_argb),  // %0
+      "+r"(dst_y),     // %1
+      "+r"(pix)        // %2
+    :
+    : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+  );
+}
+#endif  // HAS_ARGBTOYROW_NEON
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus