From 9eefb2e8dd2c40a8b6bd0f02d794fe78332fc08f Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 18 Jan 2012 23:56:30 +0000 Subject: [PATCH] ARGBToRGB functions optimized BUG=none TEST=media_unittest.exe --gunit_catch_exceptions=0 --yuvconverter_repeat=1000 --gunit_filter=LmiVideoFrameTest.ConvertTo*R* Review URL: https://webrtc-codereview.appspot.com/355002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@138 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/planar_functions.h | 10 + source/planar_functions.cc | 318 +++++++++++++++++++++--------- source/row.h | 20 ++ source/row_win.cc | 204 +++++++++++++++++++ 4 files changed, 462 insertions(+), 90 deletions(-) diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 26e9eb275..7a01c5129 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -192,6 +192,16 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra, uint8* dst_argb, int dst_stride_argb, int width, int height); +// Convert ARGB To RGB24. +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height); + +// Convert ARGB To RAW. +int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb, int dst_stride_rgb, + int width, int height); + // Convert ARGB to I400. 
int ARGBToI400(const uint8* src_argb, int src_stride_argb, uint8* dst_y, int dst_stride_y, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 20ae26a6c..018b9c11a 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1633,27 +1633,40 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToRGB24Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTORGB24ROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTORGB24ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToRGB24Row = FastConvertYUVToRGB24Row_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } else +#endif + { + ARGBToRGB24Row = ARGBToRGB24Row_C; + } + for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB24Row(src_y, src_u, src_v, dst_argb, width); 
+ FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + ARGBToRGB24Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1666,37 +1679,50 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, // Convert I420 to RAW. int I420ToRAW(const uint8* src_y, int src_stride_y, - const uint8* src_u, int src_stride_u, - const uint8* src_v, int src_stride_v, - uint8* dst_argb, int dst_stride_argb, - int width, int height) { + const uint8* src_u, int src_stride_u, + const uint8* src_v, int src_stride_v, + uint8* dst_argb, int dst_stride_argb, + int width, int height) { // Negative height means invert the image. if (height < 0) { height = -height; dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToRAWRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTORAWROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTORAWROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToRAWRow = FastConvertYUVToRAWRow_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) 
+ if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } else +#endif + { + ARGBToRAWRow = ARGBToRAWRow_C; + } + for (int y = 0; y < height; ++y) { - FastConvertYUVToRAWRow(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + ARGBToRAWRow(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1719,27 +1745,40 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTORGB565ROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && 
IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } else +#endif + { + ARGBToRGB565Row = ARGBToRGB565Row_C; + } + for (int y = 0; y < height; ++y) { - FastConvertYUVToRGB565Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + ARGBToRGB565Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1762,27 +1801,40 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGB1555Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTOARGB1555ROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGB1555ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGB1555Row = FastConvertYUVToARGB1555Row_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTOARGB1555ROW_SSE2_DISABLED) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToARGB1555Row 
= ARGBToARGB1555Row_SSE2; + } else +#endif + { + ARGBToARGB1555Row = ARGBToARGB1555Row_C; + } + for (int y = 0; y < height; ++y) { - FastConvertYUVToARGB1555Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + ARGBToARGB1555Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -1792,6 +1844,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, } return 0; } + // Convert I420 to ARGB4444. int I420ToARGB4444(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, @@ -1804,27 +1857,40 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } - void (*FastConvertYUVToARGB4444Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTOARGB4444ROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTOARGB4444ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToARGB4444Row = FastConvertYUVToARGB4444Row_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if 
defined(HAS_ARGBTOARGB4444ROW_SSE2) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; + } else +#endif + { + ARGBToARGB4444Row = ARGBToARGB4444Row_C; + } + for (int y = 0; y < height; ++y) { - FastConvertYUVToARGB4444Row(src_y, src_u, src_v, dst_argb, width); + FastConvertYUVToARGBRow(src_y, src_u, src_v, row, width); + ARGBToARGB4444Row(row, dst_argb, width); dst_argb += dst_stride_argb; src_y += src_stride_y; if (y & 1) { @@ -2119,6 +2185,65 @@ int BG24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, return 0; } +// Convert ARGB To RGB24. +int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb24, int dst_stride_rgb24, + int width, int height) { + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; + } else +#endif + { + ARGBToRGB24Row = ARGBToRGB24Row_C; + } + + for (int y = 0; y < height; ++y) { + ARGBToRGB24Row(src_argb, dst_rgb24, width); + src_argb += src_stride_argb; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + +// Convert ARGB To RAW. 
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb, + uint8* dst_raw, int dst_stride_raw, + int width, int height) { + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) + if (TestCpuFlag(kCpuHasSSSE3) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && + IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { + ARGBToRAWRow = ARGBToRAWRow_SSSE3; + } else +#endif + { + ARGBToRAWRow = ARGBToRAWRow_C; + } + + for (int y = 0; y < height; ++y) { + ARGBToRAWRow(src_argb, dst_raw, width); + src_argb += src_stride_argb; + dst_raw += dst_stride_raw; + } + return 0; +} // Convert NV12 to RGB565. int NV12ToRGB565(const uint8* src_y, int src_stride_y, @@ -2131,25 +2256,37 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, dst_rgb = dst_rgb + (height - 1) * dst_stride_rgb; dst_stride_rgb = -dst_stride_rgb; } - void (*FastConvertYUVToRGB565Row)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); -#if defined(HAS_FASTCONVERTYUVTORGB565ROW_NEON) + void (*FastConvertYUVToARGBRow)(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_NEON; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_NEON; } else -#elif defined(HAS_FASTCONVERTYUVTORGB565ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_SSSE3; +#elif defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) { + FastConvertYUVToARGBRow = 
FastConvertYUVToARGBRow_SSSE3; } else #endif { - FastConvertYUVToRGB565Row = FastConvertYUVToRGB565Row_C; + FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C; } + + SIMD_ALIGNED(uint8 row[kMaxStride]); + void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); +#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) + if (TestCpuFlag(kCpuHasSSE2) && + IS_ALIGNED(width, 16) && + IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + ARGBToRGB565Row = ARGBToRGB565Row_SSE2; + } else +#endif + { + ARGBToRGB565Row = ARGBToRGB565Row_C; + } + int halfwidth = (width + 1) >> 1; void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); #if defined(HAS_SPLITUV_NEON) @@ -2166,15 +2303,16 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, { SplitUV = SplitUV_C; } - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); + SIMD_ALIGNED(uint8 rowuv[kMaxStride * 2]); for (int y = 0; y < height; ++y) { if ((y & 1) == 0) { // Copy a row of UV. - SplitUV(src_uv, row, row + kMaxStride, halfwidth); + SplitUV(src_uv, rowuv, rowuv + kMaxStride, halfwidth); src_uv += src_stride_uv; } - FastConvertYUVToRGB565Row(src_y, row, row + kMaxStride, dst_rgb, width); + FastConvertYUVToARGBRow(src_y, rowuv, rowuv + kMaxStride, row, width); + ARGBToRGB565Row(row, dst_rgb, width); dst_rgb += dst_stride_rgb; src_y += src_stride_y; } diff --git a/source/row.h b/source/row.h index 0cbd7f0a7..7bca7c28f 100644 --- a/source/row.h +++ b/source/row.h @@ -63,6 +63,12 @@ #define HAS_RGB565TOARGBROW_SSE2 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 + +#define HAS_ARGBTORGB24ROW_SSSE3 +#define HAS_ARGBTORAWROW_SSSE3 +#define HAS_ARGBTORGB565ROW_SSE2 +#define HAS_ARGBTOARGB1555ROW_SSE2 +#define HAS_ARGBTOARGB4444ROW_SSE2 #endif // The following are available on Neon platforms @@ -210,6 +216,20 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix); void ARGB1555ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); void 
ARGB4444ToARGBRow_C(const uint8* src_argb, uint8* dst_argb, int pix); +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); + +// ARGBToABGRRow_C is same as ABGRToARGB +// ARGBToBGRARow_C is same as BGRAToARGB +void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); + void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); diff --git a/source/row_win.cc b/source/row_win.cc index 6fd398593..ecd9a82e9 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -85,6 +85,15 @@ static const uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; +// Shuffle table for converting ARGB to RGB24. +static const uvec8 kShuffleMaskARGBToRGB24 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u }; + + +// Shuffle table for converting ARGB to RAW. 
+static const uvec8 kShuffleMaskARGBToRAW = { + 2u, 1u,0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; + __declspec(naked) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { @@ -494,6 +503,201 @@ __asm { } } +// TODO(fbarchard): Port to gcc +__declspec(naked) +void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskARGBToRGB24 + + convertloop: + movdqa xmm0, [eax] // fetch 16 pixels of argb + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm5 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm5 + pshufb xmm2, xmm5 + pshufb xmm3, xmm5 + movdqa xmm4, xmm1 + psllq xmm4, 12 + por xmm4, xmm0 + movdqa [edx], xmm4 // first 16 bytes + movdqa xmm4, xmm2 + psrlq xmm1, 4 + psllq xmm4, 8 + por xmm1, xmm4 + movdqa [edx + 16], xmm1 // middle 16 bytes + psrlq xmm2, 8 + psllq xmm3, 4 + por xmm2, xmm3 + movdqa [edx + 32], xmm2 // last 16 bytes + lea edx, [edx + 48] + sub ecx, 16 + ja convertloop + ret + } +} + +// TODO(fbarchard): Port to gcc +__declspec(naked) +void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + movdqa xmm5, kShuffleMaskARGBToRAW + + convertloop: + movdqa xmm0, [eax] // fetch 16 pixels of argb + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + lea eax, [eax + 64] + pshufb xmm0, xmm5 // pack 16 bytes of ARGB to 12 bytes of RGB + pshufb xmm1, xmm5 + pshufb xmm2, xmm5 + pshufb xmm3, xmm5 + movdqa xmm4, xmm1 + psllq xmm4, 12 + por xmm4, xmm0 + movdqa [edx], xmm4 // first 16 bytes + movdqa xmm4, xmm2 + psrlq xmm1, 4 + psllq xmm4, 8 + por xmm1, xmm4 + movdqa [edx + 16], xmm1 // middle 16 bytes + psrlq xmm2, 8 + psllq xmm3, 4 + por xmm2, xmm3 + 
movdqa [edx + 32], xmm2 // last 16 bytes + lea edx, [edx + 48] + sub ecx, 16 + ja convertloop + ret + } +} + +// TODO(fbarchard): Port to gcc +__declspec(naked) +void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + pcmpeqb xmm3, xmm3 // generate mask 0x001f001f + psrlw xmm3, 11 + pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 + psrlw xmm4, 10 + psllw xmm4, 5 + pcmpeqb xmm5, xmm5 // generate mask 0xf800f800 + psrlw xmm5, 11 + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + lea eax, [eax + 16] + movdqa xmm1, xmm0 // B + psrlw xmm1, 3 + pand xmm1, xmm3 + movdqa xmm2, xmm0 // G + psrlw xmm2, 5 + pand xmm2, xmm4 + por xmm1, xmm2 + psrlw xmm0, 8 // R + pand xmm0, xmm5 + por xmm0, xmm1 + pslld xmm0, 16 + psrad xmm0, 16 + packssdw xmm0, xmm0 + movq qword ptr [edx], xmm0 // store 4 pixels of RGB565 + lea edx, [edx + 8] + sub ecx, 4 + ja convertloop + ret + } +} + +// TODO(fbarchard): Port to gcc +__declspec(naked) +void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + pcmpeqb xmm3, xmm3 // generate mask 0x001f001f + psrlw xmm3, 11 + movdqa xmm4, xmm3 // generate mask 0x03e003e0 + psllw xmm4, 5 + movdqa xmm5, xmm3 // generate mask 0x7c007c00 + psllw xmm5, 10 + pcmpeqb xmm6, xmm6 // generate mask 0x80008000 + psrlw xmm6, 15 + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + lea eax, [eax + 16] + movdqa xmm1, xmm0 // B + psrlw xmm1, 3 + pand xmm1, xmm3 + movdqa xmm2, xmm0 // G + psrlw xmm2, 6 + pand xmm2, xmm4 + por xmm1, xmm2 + movdqa xmm2, xmm0 // R + psrlw xmm2, 9 + pand xmm2, xmm5 + por xmm1, xmm2 + movdqa xmm2, xmm0 // A + psrlw xmm2, 16 + pand xmm2, xmm6 + por xmm1, xmm2 + pslld xmm0, 16 + psrad xmm0, 16 + packssdw xmm1, xmm1 + movq qword ptr [edx], xmm1 // store 4 pixels of
ARGB1555 + lea edx, [edx + 8] + sub ecx, 4 + ja convertloop + ret + } +} + +// TODO(fbarchard): Port to gcc +__declspec(naked) +void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { +__asm { + pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 + psllw xmm4, 12 + movdqa xmm3, xmm4 // generate mask 0x00f000f0 + psrlw xmm3, 8 + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + + convertloop: + movdqa xmm0, [eax] // fetch 4 pixels of argb + lea eax, [eax + 16] + movdqa xmm1, xmm0 + pand xmm0, xmm3 // low nibble + pand xmm1, xmm4 // high nibble + psrl xmm0, 4 + psrl xmm1, 8 + por xmm0, xmm1 + packuswb xmm0, xmm0 + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 + lea edx, [edx + 8] + sub ecx, 4 + ja convertloop + ret + } +} + // Convert 16 ARGB pixels (64 bytes) to 16 Y values __declspec(naked) void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {