diff --git a/Android.mk b/Android.mk
index e6b5a1923..8456cc90a 100644
--- a/Android.mk
+++ b/Android.mk
@@ -16,12 +16,11 @@ common_SRC_FILES := \
     source/row_posix.cc \
     source/scale.cc \
     source/scale_argb.cc \
-    source/video_common.cc
-
-# For Neon support, add .neon to all filenames and the following
-# source/rotate_neon.cc
-# source/row_neon.cc
+    source/video_common.cc \
+    source/rotate_neon.cc \
+    source/row_neon.cc
 
-common_CFLAGS := -Wall -fexceptions
+common_CFLAGS := -Wall -fexceptions -DHAVE_ARMEABI_V7A=1 -mfloat-abi=softfp -mfpu=neon
 
 common_C_INCLUDES = $(LOCAL_PATH)/include
 
diff --git a/README.chromium b/README.chromium
index c065f08da..bd3d851b3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 370
+Version: 371
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 31a9b6207..ebeb92f77 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -67,6 +67,7 @@ extern "C" {
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
 #define HAS_SPLITUV_SSE2
+#define HAS_UYVYTOUV422ROW_SSE2
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
 #define HAS_YTOARGBROW_SSE2
@@ -119,11 +120,22 @@ extern "C" {
 #define HAS_I422TOBGRAROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TORGBAROW_NEON
+// TODO(fbarchard): Hook these up to calling functions.
+#define HAS_ARGBTORGBAROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
 #define HAS_ABGRTOARGBROW_NEON
 #define HAS_BGRATOARGBROW_NEON
 #define HAS_RGBATOARGBROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_UYVYTOYROW_NEON
+
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -542,6 +554,11 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                    uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int pix);
@@ -552,6 +569,11 @@ void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
                           uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
                              uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix);
@@ -564,6 +586,12 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                    uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_NEON(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
+
 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
                    uint8* dst_u, uint8* dst_v, int pix);
@@ -574,7 +602,11 @@ void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
                           uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int pix);
-
+void UYVYToYRow_Any_NEON(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
 
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 823928936..b90e39ea8 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 370
+#define LIBYUV_VERSION 371
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index 2554b4ef8..66127e9dd 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -744,6 +744,21 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
       }
     }
   }
+#elif defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+      if (width > 16) {
+        YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        YUY2ToUVRow = YUY2ToUVRow_NEON;
+      }
+    }
+  }
 #endif
 
   for (int y = 0; y < height - 1; y += 2) {
@@ -768,11 +783,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
-  if (!src_uyvy ||
-      !dst_y || !dst_u || !dst_v ||
-      width <= 0 || height == 0) {
-    return -1;
-  }
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -802,7 +812,23 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
       }
     }
   }
+#elif defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      UYVYToYRow = UYVYToYRow_Any_NEON;
+      if (width > 16) {
+        UYVYToUVRow = UYVYToUVRow_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        UYVYToUVRow = UYVYToUVRow_NEON;
+      }
+    }
+  }
 #endif
+
   for (int y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
     UYVYToYRow(src_uyvy, dst_y, width);
@@ -905,7 +931,55 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210,
       UYVYToYRow = UYVYToYRow_SSE2;
     }
   }
+#elif defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      UYVYToYRow = UYVYToYRow_Any_NEON;
+      if (width > 16) {
+        UYVYToUVRow = UYVYToUVRow_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        UYVYToUVRow = UYVYToUVRow_NEON;
+      }
+    }
+  }
 #endif
+
+#if defined(HAS_UYVYTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    if (width > 16) {
+      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+      UYVYToYRow = UYVYToYRow_Any_SSE2;
+    }
+    if (IS_ALIGNED(width, 16)) {
+      UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+      UYVYToUVRow = UYVYToUVRow_SSE2;
+      if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+        UYVYToYRow = UYVYToYRow_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      UYVYToYRow = UYVYToYRow_Any_NEON;
+      if (width > 16) {
+        UYVYToUVRow = UYVYToUVRow_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 8)) {
+      UYVYToYRow = UYVYToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        UYVYToUVRow = UYVYToUVRow_NEON;
+      }
+    }
+  }
+#endif
+
   for (int y = 0; y < height - 1; y += 2) {
     V210ToUYVYRow(src_v210, row, width);
     V210ToUYVYRow(src_v210 + src_stride_v210, row + kMaxStride, width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 5e624eb68..65dbf8f0c 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -671,7 +671,23 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
       }
     }
   }
+#elif defined(HAS_YUY2TOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    if (width > 8) {
+      YUY2ToYRow = YUY2ToYRow_Any_NEON;
+      if (width > 16) {
+        YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+      }
+    }
+    if (IS_ALIGNED(width, 8)) {
+      YUY2ToYRow = YUY2ToYRow_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+      }
+    }
+  }
 #endif
+
   void (*I422ToARGBRow)(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
diff --git a/source/convert_from.cc b/source/convert_from.cc
index b8b759cf0..f5b9dda3e 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -933,7 +933,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y,
     if (width * 3 <= kMaxStride) {
       ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
     }
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToRGB24Row = ARGBToRGB24Row_NEON;
     }
   }
@@ -1004,7 +1004,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y,
     if (width * 3 <= kMaxStride) {
       ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
     }
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToRAWRow = ARGBToRAWRow_NEON;
     }
   }
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 4aa618a07..ed8d1eb14 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -321,8 +321,7 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
   }
 #endif
 #if defined(HAS_ARGBTORGBAROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(width, 16)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
     ARGBToRGBARow = ARGBToRGBARow_NEON;
   }
 #endif
@@ -366,7 +365,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
     if (width * 3 <= kMaxStride) {
       ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
     }
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToRGB24Row = ARGBToRGB24Row_NEON;
     }
   }
@@ -411,7 +410,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
     if (width * 3 <= kMaxStride) {
      ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
     }
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBToRAWRow = ARGBToRAWRow_NEON;
     }
   }
diff --git a/source/row_common.cc b/source/row_common.cc
index b3cf5a979..5af81ddc5 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -957,8 +957,6 @@ YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1)
 YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1)
 YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1)
 YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1)
-#endif
-#ifdef HAS_I422TORGBAROW_NEON
 YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1)
 #endif
 #undef YANY
@@ -1000,6 +998,10 @@ YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4)
 #endif
 YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
 YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
+#ifdef HAS_YUY2TOYROW_NEON
+YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2)
+YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2)
+#endif
 #undef YANY
 
 #define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP)                            \
@@ -1021,6 +1023,10 @@ UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
 #endif
 UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
 UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
+#ifdef HAS_YUY2TOUVROW_NEON
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
+#endif
 #undef UVANY
 
 #define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP)                         \
@@ -1038,6 +1044,12 @@
 UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
          YUY2ToUV422Row_C, 2)
 UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
          UYVYToUV422Row_C, 2)
+#ifdef HAS_YUY2TOUV422ROW_NEON
+UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, \
+         YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, \
+         UYVYToUV422Row_C, 2)
+#endif
 #undef UV422ANY
 #endif
diff --git a/source/row_neon.cc b/source/row_neon.cc
index b7d14a712..465c5e737 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -72,7 +72,7 @@ void I422ToARGBRow_NEON(const uint8* y_buf,
     YUV422TORGB
     "vmov.u8    d21, d16                       \n"
     "vmov.u8    d23, #255                      \n"
-    "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
     "bgt        1b                             \n"
   : "+r"(y_buf),    // %0
@@ -105,7 +105,7 @@ void I422ToBGRARow_NEON(const uint8* y_buf,
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d21, d16                       \n"
     "vmov.u8    d19, #255                      \n"
-    "vst4.u8    {d19, d20, d21, d22}, [%3]!    \n"
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
     "bgt        1b                             \n"
   : "+r"(y_buf),    // %0
@@ -138,7 +138,7 @@ void I422ToABGRRow_NEON(const uint8* y_buf,
     "vswp.u8    d20, d22                       \n"
     "vmov.u8    d21, d16                       \n"
     "vmov.u8    d23, #255                      \n"
-    "vst4.u8    {d20, d21, d22, d23}, [%3]!    \n"
+    "vst4.8     {d20, d21, d22, d23}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
     "bgt        1b                             \n"
   : "+r"(y_buf),    // %0
@@ -170,7 +170,7 @@ void I422ToRGBARow_NEON(const uint8* y_buf,
     YUV422TORGB
     "vmov.u8    d21, d16                       \n"
     "vmov.u8    d19, #255                      \n"
-    "vst4.u8    {d19, d20, d21, d22}, [%3]!    \n"
+    "vst4.8     {d19, d20, d21, d22}, [%3]!    \n"
     "subs       %4, %4, #8                     \n"
     "bgt        1b                             \n"
   : "+r"(y_buf),    // %0
@@ -192,7 +192,7 @@
 void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
   "1:                                          \n"
-    "vld2.u8    {q0,q1}, [%0]!                 \n"  // load 16 pairs of UV
+    "vld2.u8    {q0, q1}, [%0]!                \n"  // load 16 pairs of UV
     "subs       %3, %3, #16                    \n"  // 16 processed per loop
     "vst1.u8    {q0}, [%1]!                    \n"  // store U
     "vst1.u8    {q1}, [%2]!                    \n"  // Store V
@@ -213,9 +213,9 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
   asm volatile (
   "1:                                          \n"
     "pld        [%0, #0xC0]                    \n"  // preload
-    "vldm       %0!,{q0,q1,q2,q3}              \n"  // load 64
+    "vldm       %0!,{q0, q1, q2, q3}           \n"  // load 64
     "subs       %2, %2, #64                    \n"  // 64 processed per loop
-    "vstm       %1!,{q0,q1,q2,q3}              \n"  // store 64
+    "vstm       %1!,{q0, q1, q2, q3}           \n"  // store 64
     "bgt        1b                             \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
@@ -360,21 +360,22 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
 }
 #endif  // HAS_MIRRORROWUV_NEON
 
+// TODO(fbarchard): Avoid d4-d7.
 #ifdef HAS_BGRATOARGBROW_NEON
 void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
   asm volatile (
   "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of BGRA.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vswp.u8    q2, q3                         \n"  // swap G, R
-    "vswp.u8    q1, q4                         \n"  // swap B, A
-    "vst4.u8    {q1,q2,q3,q4}, [%1]!           \n"  // store 16 pixels of ARGB.
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d6, d7                         \n"  // swap G, R
+    "vswp.u8    d5, d8                         \n"  // swap B, A
+    "vst4.8     {d5, d6, d7, d8}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_bgra),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_BGRATOARGBROW_NEON
@@ -383,16 +384,16 @@ void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
 void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
   asm volatile (
   "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of ABGR.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vswp.u8    q1, q3                         \n"  // swap R, B
-    "vst4.u8    {q1,q2,q3,q4}, [%1]!           \n"  // store 16 pixels of ARGB.
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d5, d7                         \n"  // swap R, B
+    "vst4.8     {d5, d6, d7, d8}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_abgr),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_ABGRTOARGBROW_NEON
@@ -400,17 +401,17 @@ void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
 #ifdef HAS_RGBATOARGBROW_NEON
 void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
   asm volatile (
-  "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of RGBA.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vmov.u8    q5, q1                         \n"  // move A after RGB
-    "vst4.u8    {q2,q3,q4,q5}, [%1]!           \n"  // store 16 pixels of ARGB.
-    "bgt        1b                             \n"
+  "1:                                          \n"
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d9, d5                         \n"  // move A after RGB
+    "vst4.8     {d6, d7, d8, d9}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
   : "+r"(src_rgba),  // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4", "q5"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8", "d9"  // Clobber List
   );
 }
 #endif  // HAS_RGBATOARGBROW_NEON
@@ -418,17 +419,17 @@ void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
 #ifdef HAS_RGB24TOARGBROW_NEON
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    q4, #255                       \n"  // Alpha
+    "vmov.u8    d8, #255                       \n"  // Alpha
   "1:                                          \n"
-    "vld3.u8    {q1,q2,q3}, [%0]!              \n"  // load 16 pixels of RGB24.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vst4.u8    {q1,q2,q3,q4}, [%1]!           \n"  // store 16 pixels of ARGB.
+    "vld3.8     {d5, d6, d7}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst4.8     {d5, d6, d7, d8}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
   : "+r"(src_rgb24),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(pix)        // %2
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_RGB24TOARGBROW_NEON
@@ -436,18 +437,18 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
 #ifdef HAS_RAWTOARGBROW_NEON
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8    q4, #255                       \n"  // Alpha
+    "vmov.u8    d8, #255                       \n"  // Alpha
   "1:                                          \n"
-    "vld3.u8    {q1,q2,q3}, [%0]!              \n"  // load 16 pixels of RAW.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vswp.u8    q1, q3                         \n"  // swap R, B
-    "vst4.u8    {q1,q2,q3,q4}, [%1]!           \n"  // store 16 pixels of ARGB.
+    "vld3.8     {d5, d6, d7}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d5, d7                         \n"  // swap R, B
+    "vst4.8     {d5, d6, d7, d8}, [%1]!        \n"  // store 8 pixels of ARGB.
     "bgt        1b                             \n"
-  : "+r"(src_raw),  // %0
+  : "+r"(src_raw),   // %0
     "+r"(dst_argb),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_RAWTOARGBROW_NEON
@@ -456,16 +457,16 @@
 void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
   asm volatile (
   "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of ARGB.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vmov.u8    q0, q4                         \n"
-    "vst4.u8    {q0,q1,q2,q3}, [%1]!           \n"  // store 16 pixels of RGBA.
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d4, d8                         \n"
+    "vst4.8     {d4, d5, d6, d7}, [%1]!        \n"  // store 8 pixels of RGBA.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
    "+r"(dst_rgba),  // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q0", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d4", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORGBAROW_NEON
@@ -474,15 +475,15 @@ void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
 void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
   asm volatile (
   "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of ARGB.
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vst3.u8    {q1,q2,q3}, [%1]!              \n"  // store 16 pixels of RGB24.
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.8     {d5, d6, d7}, [%1]!            \n"  // store 8 pixels of RGB24.
     "bgt        1b                             \n"
   : "+r"(src_argb),   // %0
     "+r"(dst_rgb24),  // %1
     "+r"(pix)         // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORGB24ROW_NEON
@@ -491,20 +492,144 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
 void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
   asm volatile (
   "1:                                          \n"
-    "vld4.u8    {q1,q2,q3,q4}, [%0]!           \n"  // load 16 pixels of ARGB.
-    "vswp.u8    q1, q3                         \n"  // swap R, B
-    "subs       %2, %2, #16                    \n"  // 16 processed per loop.
-    "vst3.u8    {q1,q2,q3}, [%1]!              \n"  // store 16 pixels of RAW.
+    "vld4.8     {d5, d6, d7, d8}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "vswp.u8    d5, d7                         \n"  // swap R, B
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.8     {d5, d6, d7}, [%1]!            \n"  // store 8 pixels of RAW.
     "bgt        1b                             \n"
   : "+r"(src_argb),  // %0
     "+r"(dst_raw),   // %1
     "+r"(pix)        // %2
   :
-  : "memory", "cc", "q1", "q2", "q3", "q4"  // Clobber List
+  : "memory", "cc", "d5", "d6", "d7", "d8"  // Clobber List
   );
 }
 #endif  // HAS_ARGBTORAWROW_NEON
 
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld2.u8    {d0, d1}, [%0]!                \n"  // load 8 pixels of YUY2.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst1.u8    {d0}, [%1]!                    \n"  // store 8 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld2.u8    {d0, d1}, [%0]!                \n"  // load 8 pixels of UYVY.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst1.u8    {d1}, [%1]!                    \n"  // store 8 pixels of Y.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "vst1.u8    {d1}, [%1]!                    \n"  // store 8 U.
+    "vst1.u8    {d3}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "subs       %3, %3, #16                    \n"  // 16 pixels = 8 UVs.
+    "vst1.u8    {d0}, [%1]!                    \n"  // store 8 U.
+    "vst1.u8    {d2}, [%2]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_u),     // %1
+    "+r"(dst_v),     // %2
+    "+r"(pix)        // %3
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "adds       %1, %0, %1                     \n"  // stride + src_yuy2
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of YUY2.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row YUY2.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "vrhadd.u8  d1, d1, d5                     \n"  // average rows of U
+    "vrhadd.u8  d3, d3, d7                     \n"  // average rows of V
+    "vst1.u8    {d1}, [%2]!                    \n"  // store 8 U.
+    "vst1.u8    {d3}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_yuy2),     // %0
+    "+r"(stride_yuy2),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+#endif  // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "adds       %1, %0, %1                     \n"  // stride + src_uyvy
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load 16 pixels of UYVY.
+    "vld4.8     {d4, d5, d6, d7}, [%1]!        \n"  // load next row UYVY.
+    "subs       %4, %4, #16                    \n"  // 16 pixels = 8 UVs.
+    "vrhadd.u8  d0, d0, d4                     \n"  // average rows of U
+    "vrhadd.u8  d2, d2, d6                     \n"  // average rows of V
+    "vst1.u8    {d0}, [%2]!                    \n"  // store 8 U.
+    "vst1.u8    {d2}, [%3]!                    \n"  // store 8 V.
+    "bgt        1b                             \n"
+  : "+r"(src_uyvy),     // %0
+    "+r"(stride_uyvy),  // %1
+    "+r"(dst_u),        // %2
+    "+r"(dst_v),        // %3
+    "+r"(pix)           // %4
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"  // Clobber List
+  );
+}
+#endif  // HAS_UYVYTOYROW_NEON
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus
diff --git a/source/scale.cc b/source/scale.cc
index 877d8d93e..46deea2b9 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -2755,13 +2755,11 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
  * its original size.
  *
  */
-static void ScalePlaneDown2(int src_width, int src_height,
+static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
-  assert(IS_ALIGNED(src_width, 2));
-  assert(IS_ALIGNED(src_height, 2));
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C;
@@ -2795,13 +2793,11 @@ static void ScalePlaneDown2(int src_width, int src_height,
  * This is an optimized version for scaling down a plane to 1/4 of
  * its original size.
  */
-static void ScalePlaneDown4(int src_width, int src_height,
+static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
-  assert(IS_ALIGNED(src_width, 4));
-  assert(IS_ALIGNED(src_height, 4));
   void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
       filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C;
@@ -2832,13 +2828,11 @@ static void ScalePlaneDown4(int src_width, int src_height,
  * of its original size.
  *
 */
-static void ScalePlaneDown8(int src_width, int src_height,
+static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
                             const uint8* src_ptr, uint8* dst_ptr,
                             FilterMode filtering) {
-  assert(IS_ALIGNED(src_width, 8));
-  assert(IS_ALIGNED(src_height, 8));
   void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
       filtering && (dst_width <= kMaxOutputWidth) ?
@@ -2864,7 +2858,7 @@ static void ScalePlaneDown8(int src_width, int src_height,
  * Provided by Frank Barchard (fbarchard@google.com)
  *
  */
-static void ScalePlaneDown34(int src_width, int src_height,
+static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
@@ -2953,7 +2947,7 @@ static void ScalePlaneDown34(int src_width, int src_height,
  * ggghhhii
  * Boxes are 3x3, 2x3, 3x2 and 2x2
  */
-static void ScalePlaneDown38(int src_width, int src_height,
+static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
                              const uint8* src_ptr, uint8* dst_ptr,
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index e264a01b5..fea8b6294 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -791,13 +791,11 @@ void ScaleARGBFilterRows_C(uint8* dst_ptr, const uint8* src_ptr,
  * its original size.
  *
  */
-static void ScaleARGBDown2(int src_width, int src_height,
+static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
                            const uint8* src_ptr, uint8* dst_ptr,
                            FilterMode filtering) {
-  assert(IS_ALIGNED(src_width, 2));
-  assert(IS_ALIGNED(src_height, 2));
   void (*ScaleARGBRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                             uint8* dst_ptr, int dst_width) =
       filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
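
Note on the dispatch pattern: every block this change adds to convert.cc and
convert_argb.cc has the same shape. It defaults to the C row function,
switches to the *_Any_NEON wrapper once the width exceeds one vector step, and
switches to the bare *_NEON kernel when the width is an exact multiple of the
step (8 pixels for Y rows, 16 for UV rows). The sketch below is a minimal,
compilable illustration of that idea in plain C++. The stub kernels, the
YUY2ToY() caller, and TestCpuFlagNEON() are hypothetical stand-ins rather than
libyuv code, and the "Any" wrapper is a simplified reduction of the
YANY/UVANY machinery in row_common.cc.

#include <stdint.h>

typedef uint8_t uint8;
typedef void (*YUY2ToYRowFn)(const uint8* src_yuy2, uint8* dst_y, int pix);

// Portable reference kernel: in YUY2, Y bytes sit at even offsets.
static void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix) {
  for (int x = 0; x < pix; ++x) {
    dst_y[x] = src_yuy2[x * 2];
  }
}

// Stub standing in for the NEON kernel above (the vld2.u8/vst1.u8 loop),
// which consumes 8 pixels per iteration and assumes pix is a multiple of 8.
static void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  YUY2ToYRow_C(src_yuy2, dst_y, pix);
}

// "Any" wrapper: run the fast kernel on the largest multiple of its step,
// then let the C kernel finish the ragged tail.
static void YUY2ToYRow_Any_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
  int n = pix & ~7;  // largest multiple of the 8-pixel step
  if (n > 0) {
    YUY2ToYRow_NEON(src_yuy2, dst_y, n);
  }
  YUY2ToYRow_C(src_yuy2 + n * 2, dst_y + n, pix - n);  // 2 bytes per pixel
}

static bool TestCpuFlagNEON() { return true; }  // placeholder CPU probe

void YUY2ToY(const uint8* src_yuy2, uint8* dst_y, int width) {
  YUY2ToYRowFn YUY2ToYRow = YUY2ToYRow_C;  // safe default
  if (TestCpuFlagNEON()) {
    if (width > 8) {
      YUY2ToYRow = YUY2ToYRow_Any_NEON;    // any width: wrapper handles tail
    }
    if ((width & 7) == 0) {                // IS_ALIGNED(width, 8)
      YUY2ToYRow = YUY2ToYRow_NEON;        // no tail: use the kernel directly
    }
  }
  YUY2ToYRow(src_yuy2, dst_y, width);
}

Because the rewritten row_neon.cc kernels now process 8 pixels per iteration
(d registers) instead of 16 (q registers), callers can relax their guards from
IS_ALIGNED(width, 16) to IS_ALIGNED(width, 8), which is exactly the change
made in convert_from.cc and planar_functions.cc above.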