diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 087e95881..7e0c42c48 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -184,6 +184,8 @@ extern "C" { #define HAS_ARGBTOYROW_NEON #define HAS_MERGEUV_NEON #define HAS_YTOARGBROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON #endif // The following are available on Mips platforms @@ -231,11 +233,21 @@ typedef uint32 uvec32[4]; #define OMITFP __attribute__((optimize("omit-frame-pointer"))) #endif +void I444ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToARGBRow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); +void I411ToARGBRow_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToBGRARow_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -757,11 +769,21 @@ void RGBAToUVRow_Any_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void ARGBToYRow_Any_NEON(const uint8* src_argb, uint8* dst_y, int pix); +void I444ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); +void I411ToARGBRow_Any_NEON(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToBGRARow_Any_NEON(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 8b8016b4f..6f4c5ea10 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -80,6 +80,13 @@ int I444ToARGB(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I444ToARGBRow = I444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + 
I444ToARGBRow = I444ToARGBRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { @@ -185,6 +192,13 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I411TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I411ToARGBRow = I411ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I411ToARGBRow = I411ToARGBRow_NEON; + } + } #endif for (int y = 0; y < height; ++y) { diff --git a/source/row_any.cc b/source/row_any.cc index bc5ea964d..3ece77b33 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -61,7 +61,9 @@ YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15) YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15) #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_I422TOARGBROW_NEON +YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7) YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7) +YANY(I411ToARGBRow_Any_NEON, I411ToARGBRow_NEON, I411ToARGBRow_C, 2, 4, 7) YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1, 4, 7) YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1, 4, 7) YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1, 4, 7) diff --git a/source/row_common.cc b/source/row_common.cc index dd1622672..b438409a2 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -467,6 +467,29 @@ static __inline void YuvPixel2(uint8 y, uint8 u, uint8 v, *r = Clip(static_cast<int32>((u * UR + v * VR) - (BR) + y1) >> 6); } +#if defined(__ARM_NEON__) +// C mimic assembly. +// TODO(fbarchard): Remove subsampling from Neon. 
+void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + uint8 u = (u_buf[0] + u_buf[1] + 1) >> 1; + uint8 v = (v_buf[0] + v_buf[1] + 1) >> 1; + YuvPixel(y_buf[0], u, v, rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u, v, rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + u_buf += 2; + v_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} +#else void I444ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -480,7 +503,7 @@ void I444ToARGBRow_C(const uint8* y_buf, rgb_buf += 4; // Advance 1 pixel. } } - +#endif // Also used for 420 void I422ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, diff --git a/source/row_neon.cc b/source/row_neon.cc index f84d7ba47..89072f83e 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -24,6 +24,22 @@ extern "C" { "vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.u32 {d2[1]}, [%2]! \n" +// Read 8 Y, 2 U and 2 V from 411 +#define READYUV411 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vld1.u16 {d2[0]}, [%1]! \n" \ + "vld1.u16 {d2[1]}, [%2]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vzip.u8 d2, d3 \n" + +// Read 8 Y, 8 U and 8 V from 444 +#define READYUV444 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vld1.u8 {d2}, [%1]! \n" \ + "vld1.u8 {d3}, [%2]! \n" \ + "vpaddl.u8 q1, q1 \n" \ + "vrshrn.u16 d2, q1, #1 \n" + // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ "vld1.u8 {d0}, [%0]! 
\n" \ @@ -79,6 +95,39 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52, 0, 0, 0, 0, 0, 0, 0, 0 }; #endif +#ifdef HAS_I444TOARGBROW_NEON +void I444ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV444 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I444TOARGBROW_NEON + #ifdef HAS_I422TOARGBROW_NEON void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, @@ -112,6 +161,39 @@ void I422ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_I422TOARGBROW_NEON +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%5] \n" + "vld1.u8 {d25}, [%6] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV411 + YUV422TORGB + "subs %4, %4, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%3]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : "r"(&kUVToRB), // %5 + "r"(&kUVToG) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_I411TOARGBROW_NEON + #ifdef HAS_I422TOBGRAROW_NEON void I422ToBGRARow_NEON(const uint8* src_y, const uint8* src_u,