diff --git a/include/libyuv/row.h b/include/libyuv/row.h index f238fd0f9..a7824ee16 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -69,6 +69,8 @@ extern "C" { #define HAS_MIRRORROWUV_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 +#define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TORGB565ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 @@ -160,6 +162,8 @@ extern "C" { #define HAS_MIRRORROWUV_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV21TOARGBROW_NEON +#define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TORGB565ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RGB24TOARGBROW_NEON #define HAS_RGBATOARGBROW_NEON @@ -278,6 +282,14 @@ void NV21ToARGBRow_NEON(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, int width); +void NV12ToRGB565Row_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); +void NV21ToRGB565Row_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -463,6 +475,15 @@ void NV12ToARGBRow_C(const uint8* y_buf, uint8* argb_buf, int width); +void NV21ToRGB565Row_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); +void NV12ToRGB565Row_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + void NV21ToARGBRow_C(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, @@ -543,6 +564,16 @@ void NV21ToARGBRow_SSSE3(const uint8* y_buf, uint8* argb_buf, int width); +void NV12ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToRGB565Row_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -650,6 +681,14 @@ void NV21ToARGBRow_Any_SSSE3(const 
uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); +void NV12ToRGB565Row_Any_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); +void NV21ToRGB565Row_Any_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -785,6 +824,14 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); +void NV12ToRGB565Row_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); +void NV21ToRGB565Row_Any_NEON(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 83efdad08..b5d4ffa57 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -554,13 +554,13 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, } // Convert NV12 to RGB565. -// TODO(fbarchard): One pass conversion. LIBYUV_API int NV12ToRGB565(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height) { - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + if (!src_y || !src_uv || !dst_rgb565 || + width <= 0 || height == 0) { return -1; } // Negative height means invert the image. 
@@ -569,43 +569,28 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } - void (*NV12ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - int width) = NV12ToARGBRow_C; -#if defined(HAS_NV12TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + void (*NV12ToRGB565Row)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToRGB565Row_C; +#if defined(HAS_NV12TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) { + NV12ToRGB565Row = NV12ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_SSSE3; + NV12ToRGB565Row = NV12ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV12TOARGBROW_NEON) +#elif defined(HAS_NV12TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - NV12ToARGBRow = NV12ToARGBRow_Any_NEON; + NV12ToRGB565Row = NV12ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - NV12ToARGBRow = NV12ToARGBRow_NEON; - } - } -#endif - if (width * 4 > kMaxStride) { - return -1; - } - SIMD_ALIGNED(uint8 row[kMaxStride]); - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = - ARGBToRGB565Row_C; -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + NV12ToRGB565Row = NV12ToRGB565Row_NEON; } } #endif for (int y = 0; y < height; ++y) { - NV12ToARGBRow(src_y, src_uv, row, width); - ARGBToRGB565Row(row, dst_rgb565, width); + NV12ToRGB565Row(src_y, src_uv, dst_rgb565, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { @@ -618,10 +603,11 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, // Convert NV21 to RGB565. 
LIBYUV_API int NV21ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, + const uint8* src_vu, int src_stride_vu, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height) { - if (!src_y || !src_uv || !dst_rgb565 || width <= 0 || height == 0) { + if (!src_y || !src_vu || !dst_rgb565 || + width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -630,47 +616,32 @@ int NV21ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; dst_stride_rgb565 = -dst_stride_rgb565; } - void (*NV21ToARGBRow)(const uint8* y_buf, - const uint8* uv_buf, - uint8* rgb_buf, - int width) = NV21ToARGBRow_C; -#if defined(HAS_NV21TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + void (*NV21ToRGB565Row)(const uint8* y_buf, + const uint8* vu_buf, + uint8* rgb_buf, + int width) = NV21ToRGB565Row_C; +#if defined(HAS_NV21TORGB565ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && width * 4 <= kMaxStride) { + NV21ToRGB565Row = NV21ToRGB565Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_SSSE3; + NV21ToRGB565Row = NV21ToRGB565Row_SSSE3; } } -#elif defined(HAS_NV21TOARGBROW_NEON) +#elif defined(HAS_NV21TORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - NV21ToARGBRow = NV21ToARGBRow_Any_NEON; + NV21ToRGB565Row = NV21ToRGB565Row_Any_NEON; if (IS_ALIGNED(width, 8)) { - NV21ToARGBRow = NV21ToARGBRow_NEON; - } - } -#endif - if (width * 4 > kMaxStride) { - return -1; - } - SIMD_ALIGNED(uint8 row[kMaxStride]); - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = - ARGBToRGB565Row_C; -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && width >= 4) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; - if (IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + NV21ToRGB565Row = NV21ToRGB565Row_NEON; } } #endif for (int y = 0; y < 
height; ++y) { - NV21ToARGBRow(src_y, src_uv, row, width); - ARGBToRGB565Row(row, dst_rgb565, width); + NV21ToRGB565Row(src_y, src_vu, dst_rgb565, width); dst_rgb565 += dst_stride_rgb565; src_y += src_stride_y; if (y & 1) { - src_uv += src_stride_uv; + src_vu += src_stride_vu; } } return 0; diff --git a/source/row_any.cc b/source/row_any.cc index fe932037a..8a06202f4 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -90,16 +90,26 @@ YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15) rgb_buf + n * BPP, width & 7); \ } -#ifdef HAS_I422TOARGBROW_SSSE3 +#ifdef HAS_NV12TOARGBROW_SSSE3 NV2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, - 0, 4) + 0, 4) NV2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, - 0, 4) -#endif // HAS_I422TOARGBROW_SSSE3 -#ifdef HAS_I422TOARGBROW_NEON + 0, 4) +#endif // HAS_NV12TOARGBROW_SSSE3 +#ifdef HAS_NV12TOARGBROW_NEON NV2NY(NV12ToARGBRow_Any_NEON, NV12ToARGBRow_NEON, NV12ToARGBRow_C, 0, 4) NV2NY(NV21ToARGBRow_Any_NEON, NV21ToARGBRow_NEON, NV21ToARGBRow_C, 0, 4) -#endif // HAS_I422TOARGBROW_NEON +#endif // HAS_NV12TOARGBROW_NEON +#ifdef HAS_NV12TORGB565ROW_SSSE3 +NV2NY(NV12ToRGB565Row_Any_SSSE3, NV12ToRGB565Row_SSSE3, NV12ToRGB565Row_C, + 0, 2) +NV2NY(NV21ToRGB565Row_Any_SSSE3, NV21ToRGB565Row_SSSE3, NV21ToRGB565Row_C, + 0, 2) +#endif // HAS_NV12TORGB565ROW_SSSE3 +#ifdef HAS_NV12TORGB565ROW_NEON +NV2NY(NV12ToRGB565Row_Any_NEON, NV12ToRGB565Row_NEON, NV12ToRGB565Row_C, 0, 2) +NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) +#endif // HAS_NV12TORGB565ROW_NEON #undef NVANY // RGB to RGB does multiple of 16 pixels with SIMD and remainder with C. 
diff --git a/source/row_common.cc b/source/row_common.cc index 44da71b47..dd1622672 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -710,6 +710,74 @@ void NV21ToARGBRow_C(const uint8* y_buf, } } +void NV12ToRGB565Row_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0); + YuvPixel2(y_buf[1], uv_buf[0], uv_buf[1], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + y_buf += 2; + uv_buf += 2; + dst_rgb565 += 4; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel2(y_buf[0], uv_buf[0], uv_buf[1], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + +void NV21ToRGB565Row_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* dst_rgb565, + int width) { + uint8 b0; + uint8 g0; + uint8 r0; + uint8 b1; + uint8 g1; + uint8 r1; + for (int x = 0; x < width - 1; x += 2) { + YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0); + YuvPixel2(y_buf[1], vu_buf[1], vu_buf[0], &b1, &g1, &r1); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + b1 = b1 >> 3; + g1 = g1 >> 2; + r1 = r1 >> 3; + *reinterpret_cast<uint32*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27); + y_buf += 2; + vu_buf += 2; + dst_rgb565 += 4; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel2(y_buf[0], vu_buf[1], vu_buf[0], &b0, &g0, &r0); + b0 = b0 >> 3; + g0 = g0 >> 2; + r0 = r0 >> 3; + *reinterpret_cast<uint16*>(dst_rgb565) = b0 | (g0 << 5) | (r0 << 11); + } +} + void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1311,6 +1379,24 @@ void I422ToARGB4444Row_SSSE3(const uint8* y_buf, I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width); ARGBToARGB4444Row_SSE2(row, rgb_buf, width); } +void NV12ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + NV12ToARGBRow_SSSE3(src_y, src_uv, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); +} + +void NV21ToRGB565Row_SSSE3(const uint8* src_y, + const uint8* src_vu, + uint8* dst_rgb565, + int width) { + SIMD_ALIGNED(uint8 row[kMaxStride]); + NV21ToARGBRow_SSSE3(src_y, src_vu, row, width); + ARGBToRGB565Row_SSE2(row, dst_rgb565, width); +} + #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(YUV_DISABLE_ASM) diff --git a/source/row_neon.cc b/source/row_neon.cc index 028f54bf9..52783dcdb 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -473,6 +473,68 @@ void NV21ToARGBRow_NEON(const uint8* src_y, } #endif // HAS_NV21TOARGBROW_NEON +#ifdef HAS_NV12TORGB565ROW_NEON +void NV12ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%4] \n" + "vld1.u8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV12 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. 
+ "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%4] \n" + "vld1.u8 {d25}, [%5] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READNV21 + YUV422TORGB + "subs %3, %3, #8 \n" + ARGBTORGB565 + "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : "r"(&kUVToRB), // %4 + "r"(&kUVToG) // %5 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_NV21TORGB565ROW_NEON + #ifdef HAS_SPLITUV_NEON // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.