From 2d9fe08225ab28f62b515b2b914accc6a7b060fb Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 5 Jun 2012 22:11:34 +0000 Subject: [PATCH] direct conversion from NV12 to ARGB BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/645004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@281 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 23 +- include/libyuv/version.h | 2 +- source/convert.cc | 2 +- source/planar_functions.cc | 277 +++++++++++------- source/row.h | 150 ++++++---- source/row_common.cc | 128 ++++++--- source/row_posix.cc | 449 ++++++++++++++++++++---------- source/row_win.cc | 200 +++++++++++++ unit_test/planar_test.cc | 119 +++++--- 10 files changed, 977 insertions(+), 375 deletions(-) diff --git a/README.chromium b/README.chromium index 2aac8c3a2..8d7cf524e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 280 +Version: 281 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index a918a6fac..a7d38c1fc 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -47,16 +47,33 @@ int I420Mirror(const uint8* src_y, int src_stride_y, uint8* dst_v, int dst_stride_v, int width, int height); -// Convert NV12 to ARGB. Also used for NV21. +// Convert NV12 to ARGB. int NV12ToARGB(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, uint8* dst_frame, int dst_stride_frame, int width, int height); -// Convert NV12 to RGB565. Also used for NV21. +// Convert NV21 to ARGB. +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert M420 to ARGB. +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height); + +// Convert NV12 to RGB565. int NV12ToRGB565(const uint8* src_y, int src_stride_y, const uint8* src_uv, int src_stride_uv, - uint8* dst_frame, int dst_stride_frame, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height); + +// Convert NV21 to RGB565. +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, int width, int height); // Convert YUY2 to ARGB. diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c94ca2477..4c36aedb3 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 280 +#define LIBYUV_VERSION 281 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 1cfb4c4ac..f53db8e16 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -367,7 +367,7 @@ static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1, // Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for // easy conversion to I420. // M420 format description: -// M420 is row biplanar 420: 2 rows of Y and 1 row of VU. +// M420 is row biplanar 420: 2 rows of Y and 1 row of UV. // Chroma is half width / half height. (420) // src_stride_m420 is row planar. Normally this will be the width in pixels. 
// The UV plane is half width, but 2 values, so src_stride_m420 applies to diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 712a995c1..1100a1d17 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -839,51 +839,191 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* argb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#elif defined(HAS_I422TOARGBROW_SSSE3) + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } } } #endif - int halfwidth = (width + 1) >> 1; - void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = - SplitUV_C; -#if defined(HAS_SPLITUV_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUV = SplitUV_NEON; - } -#elif defined(HAS_SPLITUV_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) { - SplitUV = SplitUV_SSE2; - } -#endif - SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]); for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - // Copy a row of UV. - SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); - src_uv += src_stride_uv; - } - I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, dst_argb, width); + NV12ToARGBRow(src_y, src_uv, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to ARGB. +int NV21ToARGB(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* vu_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_vu, dst_argb, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } + } + return 0; +} + +// Convert M420 to ARGB. +int M420ToARGB(const uint8* src_m420, int src_stride_m420, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { + NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } + } + } +#endif + + for (int y = 0; y < height - 1; y += 2) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2, + dst_argb + dst_stride_argb, width); + dst_argb += dst_stride_argb * 2; + src_m420 += src_stride_m420 * 3; + } + if (height & 1) { + NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width); + } + return 0; +} + +// Convert NV12 to RGB565. +// TODO(fbarchard): (Re) Optimize for Neon. +int NV12ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_uv, int src_stride_uv, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + void (*NV12ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV12ToARGBRow_C; +#if defined(HAS_NV12TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + NV12ToARGBRow = NV12ToARGBRow_SSSE3; + } +#endif + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + NV12ToARGBRow(src_y, src_uv, row, width); + ARGBToRGB565Row(row, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +// Convert NV21 to RGB565. +int NV21ToRGB565(const uint8* src_y, int src_stride_y, + const uint8* src_vu, int src_stride_vu, + uint8* dst_rgb565, int dst_stride_rgb565, + int width, int height) { + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565; + dst_stride_rgb565 = -dst_stride_rgb565; + } + void (*NV21ToARGBRow)(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) = NV21ToARGBRow_C; +#if defined(HAS_NV21TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width * 4 <= kMaxStride) { + NV21ToARGBRow = NV21ToARGBRow_SSSE3; + } +#endif + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = + ARGBToRGB565Row_C; +#if defined(HAS_ARGBTORGB565ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } +#endif + + for (int y = 0; y < height; ++y) { + NV21ToARGBRow(src_y, src_vu, row, width); + ARGBToRGB565Row(row, dst_rgb565, width); + dst_rgb565 += dst_stride_rgb565; + src_y += src_stride_y; + if (y & 1) { + src_vu += src_stride_vu; + } } return 0; } @@ -1020,69 +1160,6 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, return 0; } -// Convert NV12 to RGB565. -int NV12ToRGB565(const uint8* src_y, int src_stride_y, - const uint8* src_uv, int src_stride_uv, - uint8* dst_rgb, int dst_stride_rgb, - int width, int height) { - // Negative height means invert the image. - if (height < 0) { - height = -height; - dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; - dst_stride_rgb = -dst_stride_rgb; - } - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } -#elif defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } -#endif - SIMD_ALIGNED(uint8 row[kMaxStride]); - void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = - ARGBToRGB565Row_C; -#if defined(HAS_ARGBTORGB565ROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4)) { - ARGBToRGB565Row = ARGBToRGB565Row_SSE2; - } -#endif - - int halfwidth = (width + 1) >> 1; - void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) = - SplitUV_C; -#if defined(HAS_SPLITUV_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - SplitUV = SplitUV_NEON; - } -#elif defined(HAS_SPLITUV_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16)) { - SplitUV = SplitUV_SSE2; - } -#endif - SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]); - - for (int y = 0; y < height; ++y) { - if ((y & 1) == 0) { - // Copy a row of UV. 
- SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); - src_uv += src_stride_uv; - } - I422ToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); - ARGBToRGB565Row(row, dst_rgb, width); - dst_rgb += dst_stride_rgb; - src_y += src_stride_y; - } - return 0; -} - // SetRow8 writes 'count' bytes using a 32 bit value repeated // SetRow32 writes 'count' words using a 32 bit value repeated diff --git a/source/row.h b/source/row.h index 6a4ba990b..0254f92ad 100644 --- a/source/row.h +++ b/source/row.h @@ -54,12 +54,14 @@ extern "C" { #define HAS_BGRATOYROW_SSSE3 #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 -#define HAS_I400TOARGBROW_SSE2 -#define HAS_I422TOABGRROW_SSSE3 -#define HAS_I422TOARGBROW_SSSE3 -#define HAS_I422TOBGRAROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_I422TOARGBROW_SSSE3 #define HAS_I411TOARGBROW_SSSE3 +#define HAS_NV12TOARGBROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 +#define HAS_I422TOBGRAROW_SSSE3 +#define HAS_I422TOABGRROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROWUV_SSSE3 #define HAS_ADDROW_SSE2 @@ -220,28 +222,16 @@ void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); -void I422ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I422ToBGRARow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I422ToABGRRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - void I444ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, - uint8* rgb_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, int width); void I411ToARGBRow_C(const uint8* y_buf, @@ -250,6 +240,28 @@ void I411ToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); +void NV12ToARGBRow_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); @@ -269,6 +281,16 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, void I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* vu_buf, uint8* argb_buf, int width); @@ -299,6 +321,16 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* vu_buf, uint8* argb_buf, int width); @@ -314,6 +346,46 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, uint8* abgr_buf, int width); +void I444ToARGBRow_Any_SSSE3(const uint8* y_buf, + const 
uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I422ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* argb_buf, + int width); + +void I411ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void NV12ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width); + +void NV21ToARGBRow_Any_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width); + +void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* bgra_buf, + int width); + +void I422ToABGRRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width); + void YToARGBRow_SSE2(const uint8* y_buf, uint8* argb_buf, int width); @@ -334,38 +406,6 @@ void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1, void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width); -// 'Any' functions handle any size and alignment. -void I444ToARGBRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I422ToARGBRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I411ToARGBRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - -void I422ToABGRRow_Any_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - - void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index 4fe019ce7..bcc36ef2c 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -359,6 +359,20 @@ static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf, (255u << ashift); } +void I444ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width; ++x) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0); + y_buf += 1; + u_buf += 1; + v_buf += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + // Also used for 420 void I422ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, @@ -378,6 +392,64 @@ void I422ToARGBRow_C(const uint8* y_buf, } } +void I411ToARGBRow_C(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 3; x += 4) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0); + YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0); + y_buf += 4; + u_buf += 1; + v_buf += 1; + rgb_buf += 16; // Advance 4 pixels. + } + if (width & 2) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void NV12ToARGBRow_C(const uint8* y_buf, + const uint8* uv_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], uv_buf[0], uv_buf[1], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + uv_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], uv_buf[0], uv_buf[1], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void NV21ToARGBRow_C(const uint8* y_buf, + const uint8* vu_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(y_buf[1], vu_buf[1], vu_buf[0], rgb_buf + 4, 24, 16, 8, 0); + y_buf += 2; + vu_buf += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(y_buf[0], vu_buf[1], vu_buf[0], rgb_buf + 0, 24, 16, 8, 0); + } +} + void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -414,46 +486,6 @@ void I422ToABGRRow_C(const uint8* y_buf, } } -void I444ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width; ++x) { - YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0); - y_buf += 1; - u_buf += 1; - v_buf += 1; - rgb_buf += 4; // Advance 1 pixel. - } -} - -void I411ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) { - for (int x = 0; x < width - 3; x += 4) { - YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); - YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); - YuvPixel(y_buf[2], u_buf[0], v_buf[0], rgb_buf + 8, 24, 16, 8, 0); - YuvPixel(y_buf[3], u_buf[0], v_buf[0], rgb_buf + 12, 24, 16, 8, 0); - y_buf += 4; - u_buf += 1; - v_buf += 1; - rgb_buf += 16; // Advance 4 pixels. - } - if (width & 2) { - YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); - YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0); - y_buf += 2; - rgb_buf += 8; // Advance 2 pixels. 
- } - if (width & 1) { - YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0); - } -} - void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) { for (int x = 0; x < width; ++x) { YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0); @@ -728,10 +760,26 @@ void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1, rgb_buf + n * 4, width & 7); \ } +// Wrappers to handle odd width +#define Y2NY(NAMEANY, NV12TORGB_SSE, NV12TORGB_C, UV_SHIFT) \ + void NAMEANY(const uint8* y_buf, \ + const uint8* uv_buf, \ + uint8* rgb_buf, \ + int width) { \ + int n = width & ~7; \ + NV12TORGB_SSE(y_buf, uv_buf, rgb_buf, n); \ + NV12TORGB_C(y_buf + n, \ + uv_buf + (n >> UV_SHIFT), \ + rgb_buf + n * 4, width & 7); \ + } + + #if defined(HAS_I422TOARGBROW_SSSE3) YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2) +Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0) +Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) #endif diff --git a/source/row_posix.cc b/source/row_posix.cc index 28f10c040..6b4af0855 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1231,14 +1231,17 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, #define YG 74 /* static_cast(1.164 * 64 + 0.5) */ struct { - vec8 kUVToB; - vec8 kUVToG; - vec8 kUVToR; - vec16 kUVBiasB; - vec16 kUVBiasG; - vec16 kUVBiasR; - vec16 kYSub16; - vec16 kYToRgb; + vec8 kUVToB; // 0 + vec8 kUVToG; // 16 + vec8 kUVToR; // 32 + vec16 kUVBiasB; // 48 + vec16 kUVBiasG; // 64 + vec16 kUVBiasR; // 80 + vec16 kYSub16; // 96 + vec16 kYToRgb; // 112 + vec8 kVUToB; // 128 + vec8 kVUToG; // 144 + vec8 kVUToR; // 160 } CONST SIMD_ALIGNED(kYuvConstants) = { { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB }, { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }, @@ -1247,48 +1250,58 @@ struct { { BG, BG, BG, BG, BG, BG, BG, BG }, { BR, BR, BR, BR, BR, BR, BR, BR }, { 16, 16, 16, 16, 16, 16, 16, 16 }, - { YG, YG, YG, YG, YG, YG, YG, YG } + { YG, YG, YG, YG, YG, YG, YG, YG }, + { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB }, + { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG }, + { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR } }; + // Read 8 UV from 411 #define READYUV444 \ - "movq (%1),%%xmm0 \n" \ - "movq (%1,%2,1),%%xmm1 \n" \ - "lea 0x8(%1),%1 \n" \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ // Read 4 UV from 422, upsample to 8 UV #define READYUV422 \ - "movd (%1),%%xmm0 \n" \ - "movd (%1,%2,1),%%xmm1 \n" \ - "lea 0x4(%1),%1 \n" \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x4(%[u_buf]),%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ // Read 2 UV from 411, upsample to 8 UV #define READYUV411 \ - "movd (%1),%%xmm0 \n" \ - "movd (%1,%2,1),%%xmm1 \n" \ - "lea 0x2(%1),%1 \n" \ + "movd (%[u_buf]),%%xmm0 \n" \ + "movd (%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x2(%[u_buf]),%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ 
"punpckldq %%xmm0,%%xmm0 \n" \ +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 \ + "movq (%[uv_buf]),%%xmm0 \n" \ + "lea 0x8(%[uv_buf]),%[uv_buf] \n" \ + "punpcklbw %%xmm1,%%xmm0 \n" \ + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%5),%%xmm0 \n" \ - "pmaddubsw 16(%5),%%xmm1 \n" \ - "pmaddubsw 32(%5),%%xmm2 \n" \ - "psubw 48(%5),%%xmm0 \n" \ - "psubw 64(%5),%%xmm1 \n" \ - "psubw 80(%5),%%xmm2 \n" \ - "movq (%0),%%xmm3 \n" \ - "lea 0x8(%0),%0 \n" \ + "pmaddubsw (%[kYuvConstants]),%%xmm0 \n" \ + "pmaddubsw 16(%[kYuvConstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[kYuvConstants]),%%xmm2 \n" \ + "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ + "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ + "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ + "movq (%[y_buf]),%%xmm3 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%5),%%xmm3 \n" \ - "pmullw 112(%5),%%xmm3 \n" \ + "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ + "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ "paddsw %%xmm3,%%xmm0 \n" \ "paddsw %%xmm3,%%xmm1 \n" \ "paddsw %%xmm3,%%xmm2 \n" \ @@ -1297,7 +1310,32 @@ struct { "psraw $0x6,%%xmm2 \n" \ "packuswb %%xmm0,%%xmm0 \n" \ "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" + "packuswb %%xmm2,%%xmm2 \n" \ + +// Convert 8 pixels: 8 VU and 8 Y +#define YVUTORGB \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "pmaddubsw 128(%[kYuvConstants]),%%xmm0 \n" \ + "pmaddubsw 144(%[kYuvConstants]),%%xmm1 \n" \ + "pmaddubsw 160(%[kYuvConstants]),%%xmm2 \n" \ + "psubw 48(%[kYuvConstants]),%%xmm0 \n" \ + "psubw 64(%[kYuvConstants]),%%xmm1 \n" \ + "psubw 80(%[kYuvConstants]),%%xmm2 \n" \ + "movq (%[y_buf]),%%xmm3 \n" \ + "lea 0x8(%[y_buf]),%[y_buf] \n" \ + "punpcklbw %%xmm4,%%xmm3 \n" \ + "psubsw 96(%[kYuvConstants]),%%xmm3 \n" \ + "pmullw 112(%[kYuvConstants]),%%xmm3 \n" \ + "paddsw %%xmm3,%%xmm0 \n" \ + "paddsw %%xmm3,%%xmm1 \n" \ + "paddsw %%xmm3,%%xmm2 \n" \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" \ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -1305,7 +1343,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1317,17 +1355,17 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1341,7 +1379,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 
\n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1353,17 +1391,17 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1377,7 +1415,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1389,17 +1427,83 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqa %%xmm0,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqa %%xmm0,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(vu_buf), // %[uv_buf] 
+ [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1413,7 +1517,7 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1425,17 +1529,17 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1449,7 +1553,7 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1461,17 +1565,17 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1485,7 +1589,7 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, uint8* argb_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1497,17 +1601,83 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm0,%%xmm1 \n" "punpcklwd %%xmm2,%%xmm0 \n" "punpckhwd %%xmm2,%%xmm1 \n" - "movdqu %%xmm0,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(argb_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // 
%[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* vu_buf, + uint8* argb_buf, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + ".p2align 4 \n" + "1: \n" + READNV12 + YVUTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(vu_buf), // %[uv_buf] + [argb_buf]"+r"(argb_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1521,7 +1691,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, uint8* bgra_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1534,17 +1704,17 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqa %%xmm5,(%3) \n" - "movdqa %%xmm0,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqa %%xmm5,(%[argb_buf]) \n" + "movdqa %%xmm0,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(bgra_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(bgra_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1558,7 +1728,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, uint8* abgr_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1570,17 +1740,17 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqa %%xmm2,(%3) \n" - "movdqa %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqa %%xmm2,(%[argb_buf]) \n" + "movdqa %%xmm1,0x10(%[argb_buf]) \n" + "lea 
0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(abgr_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(abgr_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1594,7 +1764,7 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, uint8* bgra_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1607,17 +1777,17 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm5,%%xmm0 \n" "punpcklwd %%xmm1,%%xmm5 \n" "punpckhwd %%xmm1,%%xmm0 \n" - "movdqu %%xmm5,(%3) \n" - "movdqu %%xmm0,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqu %%xmm5,(%[argb_buf]) \n" + "movdqu %%xmm0,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(bgra_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(bgra_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" @@ -1631,7 +1801,7 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, uint8* abgr_buf, int width) { asm volatile ( - "sub %1,%2 \n" + "sub %[u_buf],%[v_buf] \n" "pcmpeqb %%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" @@ -1643,24 +1813,23 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "movdqa %%xmm2,%%xmm1 \n" "punpcklwd %%xmm0,%%xmm2 \n" "punpckhwd %%xmm0,%%xmm1 \n" - "movdqu %%xmm2,(%3) \n" - "movdqu %%xmm1,0x10(%3) \n" - "lea 0x20(%3),%3 \n" - "sub $0x8,%4 \n" + "movdqu %%xmm2,(%[argb_buf]) \n" + "movdqu %%xmm1,0x10(%[argb_buf]) \n" + "lea 0x20(%[argb_buf]),%[argb_buf] \n" + "sub $0x8,%[width] \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(u_buf), // %1 - "+r"(v_buf), // %2 - "+r"(abgr_buf), // %3 - "+rm"(width) // %4 - : "r"(&kYuvConstants.kUVToB) // %5 + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [argb_buf]"+r"(abgr_buf), // %[argb_buf] + [width]"+rm"(width) // %[width] + : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants] : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } - #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_YTOARGBROW_SSE2 diff --git a/source/row_win.cc b/source/row_win.cc index 308b08747..3c1ac42cd 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1230,6 +1230,18 @@ static const vec8 kUVToG = { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG }; +static const vec8 kVUToB = { + VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, +}; + +static const vec8 kVUToR = { + VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, +}; + +static const vec8 kVUToG = { + VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, +}; + static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; static const vec16 
kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; @@ -1265,6 +1277,13 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ } +// Read 4 UV from NV12, upsample to 8 UV +#define READNV12 __asm { \ + __asm movq xmm0, qword ptr [esi] /* UV */ \ + __asm lea esi, [esi + 8] \ + __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ + } + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB __asm { \ /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ @@ -1293,6 +1312,34 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; __asm packuswb xmm2, xmm2 /* R */ \ } +// Convert 8 pixels: 8 VU and 8 Y +#define YVUTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ + __asm movdqa xmm1, xmm0 \ + __asm movdqa xmm2, xmm0 \ + __asm pmaddubsw xmm0, kVUToB /* scale B UV */ \ + __asm pmaddubsw xmm1, kVUToG /* scale G UV */ \ + __asm pmaddubsw xmm2, kVUToR /* scale R UV */ \ + __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ + __asm psubw xmm1, kUVBiasG \ + __asm psubw xmm2, kUVBiasR \ + /* Step 2: Find Y contribution to 8 R,G,B values */ \ + __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ + __asm lea eax, [eax + 8] \ + __asm punpcklbw xmm3, xmm4 \ + __asm psubsw xmm3, kYSub16 \ + __asm pmullw xmm3, kYToRgb \ + __asm paddsw xmm0, xmm3 /* B += Y */ \ + __asm paddsw xmm1, xmm3 /* G += Y */ \ + __asm paddsw xmm2, xmm3 /* R += Y */ \ + __asm psraw xmm0, 6 \ + __asm psraw xmm1, 6 \ + __asm psraw xmm2, 6 \ + __asm packuswb xmm0, xmm0 /* B */ \ + __asm packuswb xmm1, xmm1 /* G */ \ + __asm packuswb xmm2, xmm2 /* R */ \ + } + // 8 pixels, dest aligned 16. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes) __declspec(naked) __declspec(align(16)) @@ -1423,6 +1470,82 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, } } +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. 
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqa [edx], xmm0 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + // 8 pixels, unaligned. // 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes) __declspec(naked) __declspec(align(16)) @@ -1553,6 +1676,83 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, } } + +// 8 pixels, dest aligned 16. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +__declspec(naked) __declspec(align(16)) +void NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // UV + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + +// 8 pixels, dest aligned 16. 
+// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes) +__declspec(naked) __declspec(align(16)) +void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* uv_buf, + uint8* argb_buf, + int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] // Y + mov esi, [esp + 4 + 8] // VU + mov edx, [esp + 4 + 12] // argb + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READNV12 + YVUTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop esi + ret + } +} + __declspec(naked) __declspec(align(16)) void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 64b773c71..b327bdd95 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -26,7 +26,7 @@ namespace libyuv { #define TESTPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ -TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_CvsOPT) { \ +TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \ const int kWidth = 1280; \ const int kHeight = 720; \ align_buffer_16(src_y, kWidth * kHeight); \ @@ -88,8 +88,60 @@ TESTPLANARTOB(I411, 4, 1, ARGB, 4) TESTPLANARTOB(I422, 2, 1, ARGB, 4) TESTPLANARTOB(I444, 1, 1, ARGB, 4) + +#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B) \ +TEST_F(libyuvTest, ##FMT_PLANAR##To##FMT_B##_OptVsC) { \ + const int kWidth = 1280; \ + const int kHeight = 720; \ + align_buffer_16(src_y, kWidth * kHeight); \ + align_buffer_16(src_uv, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y * 2); \ + align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \ + align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ + srandom(time(NULL)); \ + for (int i = 0; i < kHeight; ++i) \ + for (int j = 0; j < kWidth; ++j) \ + src_y[(i * kWidth) + j] = (random() & 0xff); \ + for (int i = 0; i < kHeight / SUBSAMP_X; ++i) \ + for (int j = 0; j < kWidth / SUBSAMP_Y * 2; ++j) { \ + src_uv[(i * kWidth / SUBSAMP_X) * 2 + j] = (random() & 0xff); \ + } \ + MaskCpuFlags(kCpuInitialized); \ + ##FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_uv, kWidth / SUBSAMP_X * 2, \ + dst_argb_c, kWidth * BPP_B, \ + kWidth, kHeight); \ + MaskCpuFlags(-1); \ + const int runs = 1000; \ + for (int i = 0; i < runs; ++i) { \ + ##FMT_PLANAR##To##FMT_B(src_y, kWidth, \ + src_uv, kWidth / SUBSAMP_X * 2, \ + dst_argb_opt, kWidth * BPP_B, \ + kWidth, kHeight); \ + } \ + int err = 0; \ + for (int i = 0; i < kHeight; ++i) { \ + for (int j = 0; j < kWidth * BPP_B; ++j) { \ + int diff = static_cast(dst_argb_c[i * kWidth * BPP_B + j]) - \ + static_cast(dst_argb_opt[i * kWidth * BPP_B + j]); \ + if (abs(diff) > 2) { \ + ++err; \ + } \ + } \ + } \ + EXPECT_EQ(err, 0); \ + free_aligned_buffer_16(src_y) \ + free_aligned_buffer_16(src_uv) \ + free_aligned_buffer_16(dst_argb_c) \ + free_aligned_buffer_16(dst_argb_opt) \ +} + +TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4) +TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4) +TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2) +TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2) + #define TESTATOPLANAR(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y) \ -TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_CvsOPT) { \ +TEST_F(libyuvTest, ##FMT_A##To##FMT_PLANAR##_OptVsC) { \ const int kWidth = 1280; \ const int kHeight = 
720; \ align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \ @@ -171,36 +223,34 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1) //TESTATOPLANAR(ARGB, 4, I444, 1, 1) // TODO(fbarchard): Implement and test 411 and 444 -#define TESTATOB(FMT_A, BPP_A, FMT_B, BPP_B) \ -TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \ +#define TESTATOB(FMT_A, BPP_A, STRIDE_A, FMT_B, BPP_B) \ +TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_OptVsC) { \ const int kWidth = 1280; \ const int kHeight = 720; \ - align_buffer_16(src_argb, kWidth * kHeight * BPP_A); \ + align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \ align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \ align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \ srandom(time(NULL)); \ - for (int i = 0; i < kHeight; ++i) \ - for (int j = 0; j < kWidth * BPP_A; ++j) \ - src_argb[(i * kWidth * BPP_A) + j] = (random() & 0xff); \ + for (int i = 0; i < kHeight * kWidth * BPP_A; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ MaskCpuFlags(kCpuInitialized); \ - ##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \ + ##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ dst_argb_c, kWidth * BPP_B, \ kWidth, kHeight); \ MaskCpuFlags(-1); \ const int runs = 1000; \ for (int i = 0; i < runs; ++i) { \ - ##FMT_A##To##FMT_B(src_argb, kWidth * BPP_A, \ + ##FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \ dst_argb_opt, kWidth * BPP_B, \ kWidth, kHeight); \ } \ int err = 0; \ - for (int i = 0; i < kHeight; ++i) { \ - for (int j = 0; j < kWidth * BPP_B; ++j) { \ - int diff = static_cast(dst_argb_c[i * kWidth * BPP_B + j]) - \ - static_cast(dst_argb_opt[i * kWidth * BPP_B + j]); \ - if (abs(diff) > 2) \ - err++; \ - } \ + for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \ + int diff = static_cast(dst_argb_c[i]) - \ + static_cast(dst_argb_opt[i]); \ + if (abs(diff) > 2) \ + err++; \ } \ EXPECT_EQ(err, 0); \ free_aligned_buffer_16(src_argb) \ @@ -208,25 +258,26 @@ TEST_F(libyuvTest, ##FMT_A##To##FMT_B##_CvsOPT) { \ free_aligned_buffer_16(dst_argb_opt) \ } -TESTATOB(ARGB, 4, ARGB, 4) -TESTATOB(ARGB, 4, BGRA, 4) -TESTATOB(ARGB, 4, ABGR, 4) -TESTATOB(ARGB, 4, RAW, 3) -TESTATOB(ARGB, 4, RGB24, 3) -TESTATOB(ARGB, 4, RGB565, 2) -TESTATOB(ARGB, 4, ARGB1555, 2) -TESTATOB(ARGB, 4, ARGB4444, 2) +TESTATOB(ARGB, 4, 4, ARGB, 4) +TESTATOB(ARGB, 4, 4, BGRA, 4) +TESTATOB(ARGB, 4, 4, ABGR, 4) +TESTATOB(ARGB, 4, 4, RAW, 3) +TESTATOB(ARGB, 4, 4, RGB24, 3) +TESTATOB(ARGB, 4, 4, RGB565, 2) +TESTATOB(ARGB, 4, 4, ARGB1555, 2) +TESTATOB(ARGB, 4, 4, ARGB4444, 2) -TESTATOB(BGRA, 4, ARGB, 4) -TESTATOB(ABGR, 4, ARGB, 4) -TESTATOB(RAW, 3, ARGB, 4) -TESTATOB(RGB24, 3, ARGB, 4) -TESTATOB(RGB565, 2, ARGB, 4) -TESTATOB(ARGB1555, 2, ARGB, 4) -TESTATOB(ARGB4444, 2, ARGB, 4) +TESTATOB(BGRA, 4, 4, ARGB, 4) +TESTATOB(ABGR, 4, 4, ARGB, 4) +TESTATOB(RAW, 3, 3, ARGB, 4) +TESTATOB(RGB24, 3, 3, ARGB, 4) +TESTATOB(RGB565, 2, 2, ARGB, 4) +TESTATOB(ARGB1555, 2, 2, ARGB, 4) +TESTATOB(ARGB4444, 2, 2, ARGB, 4) -TESTATOB(YUY2, 2, ARGB, 4) -TESTATOB(UYVY, 2, ARGB, 4) +TESTATOB(YUY2, 2, 2, ARGB, 4) +TESTATOB(UYVY, 2, 2, ARGB, 4) +TESTATOB(M420, 3 / 2, 1, ARGB, 4) TEST_F(libyuvTest, TestAttenuate) { SIMD_ALIGNED(uint8 orig_pixels[256][4]);
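
For reference, a minimal usage sketch of the NV12ToARGB() entry point this patch adds to planar_functions.h. Everything below is an illustrative assumption (the helper name, buffer layout, and dimensions are hypothetical); it only shows how the new API is expected to be called, it is not part of the change itself.

// Hypothetical caller of the new NV12-to-ARGB path (assumes the libyuv
// namespace and the uint8 typedef from libyuv/basic_types.h).
#include "libyuv/planar_functions.h"

// Converts one NV12 frame (full-size Y plane followed by an interleaved,
// half-height UV plane, both using 'width' as the byte stride) to packed ARGB.
void Nv12FrameToArgb(const uint8* nv12, int width, int height, uint8* argb) {
  const uint8* src_y = nv12;
  const uint8* src_uv = nv12 + width * height;  // UV plane starts after Y.
  // Each UV row holds width/2 U,V byte pairs, i.e. 'width' bytes per row.
  libyuv::NV12ToARGB(src_y, width,
                     src_uv, width,
                     argb, width * 4,  // 4 bytes per ARGB pixel.
                     width, height);   // A negative height inverts the image.
}

The NV21 variant added in the same change is called identically, except the second plane is the interleaved VU plane (NV21ToARGB with src_vu / src_stride_vu).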