From 518833b9833a52b715b487445e6ccfe4f8881903 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 12 Mar 2013 21:44:56 +0000 Subject: [PATCH] Fix RGB565ToARGB_Any which uses SSE2 that requires ARGB alignment. Add row coalescing to convert_argb.cc. Improve coalescing on planar_functions.cc and convert_from_argb.cc. Use stride * 2 == width to test for even width. Apply coalescing to all functions that have same vertical subsampling. BUG=197 TESTED=libyuv unittest passes where _Opt uses row coalescing. Review URL: https://webrtc-codereview.appspot.com/1186004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@601 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/convert_argb.cc | 105 ++++++++++++++++++++++++++++++++++-- source/convert_from_argb.cc | 73 ++++++++++++++++++++++--- source/planar_functions.cc | 56 ++++++++++--------- source/row_any.cc | 50 ++++++++++------- 6 files changed, 233 insertions(+), 55 deletions(-) diff --git a/README.chromium b/README.chromium index 985e19607..5abf5889b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 600 +Version: 601 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 25919e172..ea7a1be88 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 600 +#define LIBYUV_VERSION 601 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 6f799b582..0e93abe2c 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -15,7 +15,6 @@ #ifdef HAVE_JPEG #include "libyuv/mjpeg_decoder.h" #endif -#include "libyuv/planar_functions.h" #include "libyuv/rotate_argb.h" #include "libyuv/row.h" #include "libyuv/video_common.h" @@ -64,6 +63,17 @@ int 
I444ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + // Coalesce contiguous rows. + if (src_stride_y == width && + src_stride_u == width && + src_stride_v == width && + dst_stride_argb == width * 4) { + return I444ToARGB(src_y, 0, + src_u, 0, + src_v, 0, + dst_argb, 0, + width * height, 1); + } void (*I444ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -116,6 +126,17 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + // Coalesce contiguous rows. + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && + dst_stride_argb == width * 4) { + return I422ToARGB(src_y, 0, + src_u, 0, + src_v, 0, + dst_argb, 0, + width * height, 1); + } void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -176,6 +197,17 @@ int I411ToARGB(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + // Coalesce contiguous rows. + if (src_stride_y == width && + src_stride_u * 4 == width && + src_stride_v * 4 == width && + dst_stride_argb == width * 4) { + return I411ToARGB(src_y, 0, + src_u, 0, + src_v, 0, + dst_argb, 0, + width * height, 1); + } void (*I411ToARGBRow)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -225,6 +257,13 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y, dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_stride_argb = -dst_stride_argb; } + // Coalesce contiguous rows. 
+ if (src_stride_y == width && + dst_stride_argb == width * 4) { + return I400ToARGB_Reference(src_y, 0, + dst_argb, 0, + width * height, 1); + } void (*YToARGBRow)(const uint8* y_buf, uint8* rgb_buf, int width) = YToARGBRow_C; @@ -268,6 +307,13 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, src_y = src_y + (height - 1) * src_stride_y; src_stride_y = -src_stride_y; } + // Coalesce contiguous rows. + if (src_stride_y == width && + dst_stride_argb == width * 4) { + return I400ToARGB(src_y, 0, + dst_argb, 0, + width * height, 1); + } void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = I400ToARGBRow_C; #if defined(HAS_I400TOARGBROW_SSE2) @@ -359,6 +405,13 @@ int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24, src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; src_stride_rgb24 = -src_stride_rgb24; } + // Coalesce contiguous rows. + if (src_stride_rgb24 == width * 3 && + dst_stride_argb == width * 4) { + return RGB24ToARGB(src_rgb24, 0, + dst_argb, 0, + width * height, 1); + } void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RGB24ToARGBRow_C; #if defined(HAS_RGB24TOARGBROW_SSSE3) @@ -401,6 +454,13 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw, src_raw = src_raw + (height - 1) * src_stride_raw; src_stride_raw = -src_stride_raw; } + // Coalesce contiguous rows. + if (src_stride_raw == width * 3 && + dst_stride_argb == width * 4) { + return RAWToARGB(src_raw, 0, + dst_argb, 0, + width * height, 1); + } void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = RAWToARGBRow_C; #if defined(HAS_RAWTOARGBROW_SSSE3) @@ -443,6 +503,13 @@ int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565, src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565; src_stride_rgb565 = -src_stride_rgb565; } + // Coalesce contiguous rows. 
+ if (src_stride_rgb565 == width * 2 && + dst_stride_argb == width * 4) { + return RGB565ToARGB(src_rgb565, 0, + dst_argb, 0, + width * height, 1); + } void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) = RGB565ToARGBRow_C; #if defined(HAS_RGB565TOARGBROW_SSE2) @@ -485,6 +552,13 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } + // Coalesce contiguous rows. + if (src_stride_argb1555 == width * 2 && + dst_stride_argb == width * 4) { + return ARGB1555ToARGB(src_argb1555, 0, + dst_argb, 0, + width * height, 1); + } void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, int pix) = ARGB1555ToARGBRow_C; #if defined(HAS_ARGB1555TOARGBROW_SSE2) @@ -527,6 +601,13 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } + // Coalesce contiguous rows. + if (src_stride_argb4444 == width * 2 && + dst_stride_argb == width * 4) { + return ARGB4444ToARGB(src_argb4444, 0, + dst_argb, 0, + width * height, 1); + } void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, int pix) = ARGB4444ToARGBRow_C; #if defined(HAS_ARGB4444TOARGBROW_SSE2) @@ -721,10 +802,19 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } + // Coalesce contiguous rows. + if (width * height <= kMaxStride && + src_stride_yuy2 == width * 2 && + dst_stride_argb == width * 4) { + return YUY2ToARGB(src_yuy2, 0, + dst_argb, 0, + width * height, 1); + } void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = YUY2ToARGBRow_C; #if defined(HAS_YUY2TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { // posix it 16, win is 8. + // Posix is 16, Windows is 8. 
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && width <= kMaxStride) { YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; @@ -765,10 +855,19 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } + // Coalesce contiguous rows. + if (width * height <= kMaxStride && + src_stride_uyvy == width * 2 && + dst_stride_argb == width * 4) { + return UYVYToARGB(src_uyvy, 0, + dst_argb, 0, + width * height, 1); + } void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = UYVYToARGBRow_C; #if defined(HAS_UYVYTOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { // posix it 16, win is 8. + // Posix is 16, Windows is 8. + if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 && width <= kMaxStride) { UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 16)) { UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index a897f6f9f..79ed48645 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -36,6 +36,17 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } + // Coalesce contiguous rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u == width && + dst_stride_v == width) { + return ARGBToI444(src_argb, 0, + dst_y, 0, + dst_u, 0, + dst_v, 0, + width * height, 1); + } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, @@ -100,6 +111,17 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } + // Coalesce contiguous rows. 
+ if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + return ARGBToI422(src_argb, 0, + dst_y, 0, + dst_u, 0, + dst_v, 0, + width * height, 1); + } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; #if defined(HAS_ARGBTOUV422ROW_SSSE3) @@ -168,6 +190,17 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } + // Coalesce contiguous rows. + if (src_stride_argb == width * 4 && + dst_stride_y == width && + dst_stride_u * 4 == width && + dst_stride_v * 4 == width) { + return ARGBToI411(src_argb, 0, + dst_y, 0, + dst_u, 0, + dst_v, 0, + width * height, 1); + } void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV411Row_C; void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = @@ -446,6 +479,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb, dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2; dst_stride_yuy2 = -dst_stride_yuy2; } + // Coalesce contiguous rows. + if (width * height <= kMaxStride && + src_stride_argb == width * 4 && + dst_stride_yuy2 == width * 2) { + return ARGBToYUY2(src_argb, 0, + dst_yuy2, 0, + width * height, 1); + } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; #if defined(HAS_ARGBTOUV422ROW_SSSE3) @@ -535,6 +576,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb, dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy; dst_stride_uyvy = -dst_stride_uyvy; } + // Coalesce contiguous rows. 
+ if (width * height <= kMaxStride && + src_stride_argb == width * 4 && + dst_stride_uyvy == width * 2) { + return ARGBToUYVY(src_argb, 0, + dst_uyvy, 0, + width * height, 1); + } void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int pix) = ARGBToUV422Row_C; #if defined(HAS_ARGBTOUV422ROW_SSSE3) @@ -624,7 +673,9 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_y == width) { - return ARGBToI400(src_argb, 0, dst_y, 0, width * height, 1); + return ARGBToI400(src_argb, 0, + dst_y, 0, + width * height, 1); } void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; @@ -704,7 +755,9 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_rgb24 == width * 3) { - return ARGBToRGB24(src_argb, 0, dst_rgb24, 0, width * height, 1); + return ARGBToRGB24(src_argb, 0, + dst_rgb24, 0, + width * height, 1); } void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB24Row_C; @@ -750,7 +803,9 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_raw == width * 3) { - return ARGBToRAW(src_argb, 0, dst_raw, 0, width * height, 1); + return ARGBToRAW(src_argb, 0, + dst_raw, 0, + width * height, 1); } void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRAWRow_C; @@ -796,7 +851,9 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. 
if (src_stride_argb == width * 4 && dst_stride_rgb565 == width * 2) { - return ARGBToRGB565(src_argb, 0, dst_rgb565, 0, width * height, 1); + return ARGBToRGB565(src_argb, 0, + dst_rgb565, 0, + width * height, 1); } void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToRGB565Row_C; @@ -841,7 +898,9 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_argb1555 == width * 2) { - return ARGBToARGB1555(src_argb, 0, dst_argb1555, 0, width * height, 1); + return ARGBToARGB1555(src_argb, 0, + dst_argb1555, 0, + width * height, 1); } void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB1555Row_C; @@ -886,7 +945,9 @@ int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_argb4444 == width * 2) { - return ARGBToARGB4444(src_argb, 0, dst_argb4444, 0, width * height, 1); + return ARGBToARGB4444(src_argb, 0, + dst_argb4444, 0, + width * height, 1); } void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) = ARGBToARGB4444Row_C; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index a0cb0ae3c..cdad49c9e 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -29,8 +29,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y, uint8* dst_y, int dst_stride_y, int width, int height) { // Coalesce contiguous rows. - if (src_stride_y == width && dst_stride_y == width) { - CopyPlane(src_y, 0, dst_y, 0, width * height, 1); + if (src_stride_y == width && + dst_stride_y == width) { + CopyPlane(src_y, 0, + dst_y, 0, + width * height, 1); return; } void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; @@ -228,12 +231,14 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2, src_stride_yuy2 = -src_stride_yuy2; } // Coalesce contiguous rows. 
- if (IS_ALIGNED(width, 2) && - src_stride_yuy2 == width * 2 && + if (src_stride_yuy2 == width * 2 && dst_stride_y == width && - dst_stride_u == (width + 1) / 2 && - dst_stride_v == (width + 1) / 2) { - return YUY2ToI422(src_yuy2, 0, dst_y, 0, dst_u, 0, dst_v, 0, + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + return YUY2ToI422(src_yuy2, 0, + dst_y, 0, + dst_u, 0, + dst_v, 0, width * height, 1); } void (*YUY2ToUV422Row)(const uint8* src_yuy2, @@ -314,12 +319,14 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy, src_stride_uyvy = -src_stride_uyvy; } // Coalesce contiguous rows. - if (IS_ALIGNED(width, 2) && - src_stride_uyvy == width * 2 && + if (src_stride_uyvy == width * 2 && dst_stride_y == width && - dst_stride_u == (width + 1) / 2 && - dst_stride_v == (width + 1) / 2) { - return UYVYToI422(src_uyvy, 0, dst_y, 0, dst_u, 0, dst_v, 0, + dst_stride_u * 2 == width && + dst_stride_v * 2 == width) { + return UYVYToI422(src_uyvy, 0, + dst_y, 0, + dst_u, 0, + dst_v, 0, width * height, 1); } void (*UYVYToUV422Row)(const uint8* src_uyvy, @@ -793,10 +800,9 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, dst_stride_bgra = -dst_stride_bgra; } // Coalesce contiguous rows. - if (IS_ALIGNED(width, 2) && - src_stride_y == width && - src_stride_u == (width + 1) / 2 && - src_stride_v == (width + 1) / 2 && + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_bgra == width * 4) { return I422ToBGRA(src_y, 0, src_u, 0, @@ -865,10 +871,9 @@ int I422ToABGR(const uint8* src_y, int src_stride_y, dst_stride_abgr = -dst_stride_abgr; } // Coalesce contiguous rows. 
- if (IS_ALIGNED(width, 2) && - src_stride_y == width && - src_stride_u == (width + 1) / 2 && - src_stride_v == (width + 1) / 2 && + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_abgr == width * 4) { return I422ToABGR(src_y, 0, src_u, 0, @@ -929,10 +934,9 @@ int I422ToRGBA(const uint8* src_y, int src_stride_y, dst_stride_rgba = -dst_stride_rgba; } // Coalesce contiguous rows. - if (IS_ALIGNED(width, 2) && - src_stride_y == width && - src_stride_u == (width + 1) / 2 && - src_stride_v == (width + 1) / 2 && + if (src_stride_y == width && + src_stride_u * 2 == width && + src_stride_v * 2 == width && dst_stride_rgba == width * 4) { return I422ToRGBA(src_y, 0, src_u, 0, @@ -1074,7 +1078,9 @@ void SetPlane(uint8* dst_y, int dst_stride_y, uint32 value) { // Coalesce contiguous rows. if (dst_stride_y == width) { - SetPlane(dst_y, 0, width * height, 1, value); + SetPlane(dst_y, 0, + width * height, 1, + value); return; } void (*SetRow)(uint8* dst, uint32 value, int pix) = SetRow_C; diff --git a/source/row_any.cc b/source/row_any.cc index a1f90f06a..7e5ea5076 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -115,8 +115,6 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) #endif // HAS_NV12TORGB565ROW_NEON #undef NVANY -// TODO(fbarchard): RGBANY use last 16 method. -// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ void NAMEANY(const uint8* src, \ uint8* dst, \ @@ -145,6 +143,17 @@ RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, 15, 2, 4) RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, 15, 2, 4) +// These require alignment on ARGB, so C is used for remainder. 
+RGBANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, RGB24ToARGBRow_C, + 15, 3, 4) +RGBANY(RAWToARGBRow_Any_SSSE3,RAWToARGBRow_SSSE3, RAWToARGBRow_C, + 15, 3, 4) +RGBANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, RGB565ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, ARGB1555ToARGBRow_C, + 7, 2, 4) +RGBANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, ARGB4444ToARGBRow_C, + 7, 2, 4) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) @@ -188,30 +197,31 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. // TODO(fbarchard): Use last 16 method for all unsubsampled conversions. -#define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \ +#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_SIMDU, SBPP, BPP, NUM) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \ - ARGBTOY_SIMD(src_argb + (width - NUM) * SBPP, \ + ARGBTOY_SIMDU(src_argb + (width - NUM) * SBPP, \ dst_y + (width - NUM) * BPP, NUM); \ } #ifdef HAS_ARGBTOYROW_AVX2 -YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32) -YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) -YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) +YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_AVX2, 4, 1, 32) +YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) +YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif #ifdef HAS_ARGBTOYROW_SSSE3 -YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) -YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) -YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 
16) -YANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 3, 4, 16) -YANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 3, 4, 16) -YANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 2, 4, 8) -YANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 2, 4, 8) -YANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 2, 4, 8) +YANY(ARGBToYRow_Any_SSSE3, + ARGBToYRow_Unaligned_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(BGRAToYRow_Any_SSSE3, + BGRAToYRow_Unaligned_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(ABGRToYRow_Any_SSSE3, + ABGRToYRow_Unaligned_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(RGBAToYRow_Any_SSSE3, + RGBAToYRow_Unaligned_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4, 1, 16) +YANY(YUY2ToYRow_Any_SSE2, + YUY2ToYRow_Unaligned_SSE2, YUY2ToYRow_Unaligned_SSE2, 2, 1, 16) +YANY(UYVYToYRow_Any_SSE2, + UYVYToYRow_Unaligned_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) #endif #ifdef HAS_ARGBTOYROW_NEON YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) @@ -233,7 +243,6 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) #endif #undef YANY -// Attenuate is destructive so last16 method can not be used due to overlap. #define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ int n = width & ~MASK; \ @@ -242,6 +251,7 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) dst_y + n * BPP, width & MASK); \ } +// Attenuate is destructive so last16 method can not be used due to overlap. #ifdef HAS_ARGBATTENUATEROW_SSSE3 YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, 4, 4, 3) @@ -268,6 +278,8 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, #endif #undef YANY + + // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \ void NAMEANY(const uint8* src_argb, int src_stride_argb, \