diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index f7fc3e768..31a9b6207 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -34,36 +34,35 @@ extern "C" {
 // The following are available on all x86 platforms:
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+// Conversions.
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
-#define HAS_ARGBTORGBAROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
-#define HAS_ARGBATTENUATEROW_SSSE3
-#define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
+#define HAS_ARGBTORGBAROW_SSSE3
 #define HAS_ARGBTOUVROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_BGRATOARGBROW_SSSE3
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
+#define HAS_I400TOARGBROW_SSE2
+#define HAS_I411TOARGBROW_SSSE3
+#define HAS_I422TOABGRROW_SSSE3
 #define HAS_I422TOARGBROW_SSSE3
 #define HAS_I422TOBGRAROW_SSSE3
-#define HAS_I422TOABGRROW_SSSE3
 #define HAS_I444TOARGBROW_SSSE3
-#define HAS_I411TOARGBROW_SSSE3
-#define HAS_I400TOARGBROW_SSE2
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
-#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_NV12TOARGBROW_SSSE3
+#define HAS_NV21TOARGBROW_SSSE3
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
@@ -71,26 +70,31 @@ extern "C" {
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
 #define HAS_YTOARGBROW_SSE2
-#define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOUV422ROW_SSE2
+#define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
-#define HAS_ARGBGRAYROW_SSSE3
-#define HAS_ARGBSEPIAROW_SSSE3
+
+// Effects
+#define HAS_ARGBMIRRORROW_SSSE3
+#define HAS_ARGBAFFINEROW_SSE2
+#define HAS_ARGBATTENUATEROW_SSSE3
+#define HAS_ARGBBLENDROW_SSSE3
 #define HAS_ARGBCOLORMATRIXROW_SSSE3
+#define HAS_ARGBGRAYROW_SSSE3
+#define HAS_ARGBINTERPOLATEROW_SSSE3
 #define HAS_ARGBQUANTIZEROW_SSE2
+#define HAS_ARGBSEPIAROW_SSSE3
+#define HAS_ARGBSHADE_SSE2
+#define HAS_ARGBUNATTENUATEROW_SSE2
 #define HAS_COMPUTECUMULATIVESUMROW_SSE2
 #define HAS_CUMULATIVESUMTOAVERAGE_SSE2
-#define HAS_ARGBSHADE_SSE2
-#define HAS_ARGBAFFINEROW_SSE2
-#define HAS_ARGBINTERPOLATEROW_SSSE3
-#define HAS_NV12TOARGBROW_SSSE3
-#define HAS_NV21TOARGBROW_SSSE3
 #endif
 
 // The following are Windows only:
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_I422TORGBAROW_SSSE3
+#define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_RGBATOARGBROW_SSSE3
 #define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
@@ -115,9 +119,11 @@ extern "C" {
 #define HAS_I422TOBGRAROW_NEON
 #define HAS_I422TOABGRROW_NEON
 #define HAS_I422TORGBAROW_NEON
-#define HAS_ARGBTORGBAROW_NEON
-#define HAS_ARGBTORGB24ROW_NEON
-#define HAS_ARGBTORAWROW_NEON
+#define HAS_ABGRTOARGBROW_NEON
+#define HAS_BGRATOARGBROW_NEON
+#define HAS_RGBATOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER)
@@ -243,6 +249,12 @@ void ARGB1555ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
 void RGB565ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb, uint8* dst_argb, int pix);
 
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix);
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix);
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix);
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix);
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix);
+
 void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
 void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
 void RGBAToARGBRow_C(const uint8* src_rgba, uint8* dst_argb, int pix);
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 43ca6f825..b7d14a712 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -360,6 +360,113 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
 }
 #endif  // HAS_MIRRORROWUV_NEON
 
+#ifdef HAS_BGRATOARGBROW_NEON
+// Reverses the 4 bytes of every pixel (byte 0 <-> 3, byte 1 <-> 2).
+// Handles 8 pixels per iteration with no tail loop: pix must be a positive
+// multiple of 8.
+void BGRAToARGBRow_NEON(const uint8* src_bgra, uint8* dst_argb, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of BGRA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d2                         \n"  // swap G, R
+    "vswp.u8    d0, d3                         \n"  // swap B, A
+    "vst4.u8    {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_BGRATOARGBROW_NEON
+
+#ifdef HAS_ABGRTOARGBROW_NEON
+// Swaps byte 0 and byte 2 of every 4-byte pixel; bytes 1 and 3 are unchanged.
+// Handles 8 pixels per iteration with no tail loop: pix must be a positive
+// multiple of 8.
+void ABGRToARGBRow_NEON(const uint8* src_abgr, uint8* dst_argb, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of ABGR.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d0, d2                         \n"  // swap R, B
+    "vst4.u8    {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_abgr),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3"  // Clobber List
+  );
+}
+#endif  // HAS_ABGRTOARGBROW_NEON
+
+#ifdef HAS_RGBATOARGBROW_NEON
+// Rotates every 4-byte pixel so that source byte 0 becomes the last byte.
+// Handles 8 pixels per iteration with no tail loop: pix must be a positive
+// multiple of 8.
+void RGBAToARGBRow_NEON(const uint8* src_rgba, uint8* dst_argb, int pix) {
+  asm volatile (
+  "1:                                          \n"
+    "vld4.u8    {d0, d1, d2, d3}, [%0]!        \n"  // load 8 pixels of RGBA.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d4, d0                         \n"  // move A after RGB
+    "vst4.u8    {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgba),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_RGBATOARGBROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+// Expands 3-byte pixels to 4 bytes by appending a constant 255 alpha byte.
+// Handles 8 pixels per iteration with no tail loop: pix must be a positive
+// multiple of 8.
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+  "1:                                          \n"
+    "vld3.u8    {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RGB24.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst4.u8    {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_rgb24),  // %0
+    "+r"(dst_argb),   // %1
+    "+r"(pix)         // %2
+  :
+  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+// Expands 3-byte pixels to 4 bytes: swaps byte 0 and byte 2, appends 255.
+// Handles 8 pixels per iteration with no tail loop: pix must be a positive
+// multiple of 8.
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+  asm volatile (
+    "vmov.u8    d4, #255                       \n"  // Alpha
+  "1:                                          \n"
+    "vld3.u8    {d1, d2, d3}, [%0]!            \n"  // load 8 pixels of RAW.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vswp.u8    d1, d3                         \n"  // swap R, B
+    "vst4.u8    {d1, d2, d3, d4}, [%1]!        \n"  // store 8 pixels of ARGB.
+    "bgt        1b                             \n"
+  : "+r"(src_raw),   // %0
+    "+r"(dst_argb),  // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_RAWTOARGBROW_NEON
+
 #ifdef HAS_ARGBTORGBAROW_NEON
 void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
   asm volatile (