diff --git a/README.chromium b/README.chromium
index e1dcb9ab4..e6b3e25c2 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 480
+Version: 481
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 76f39c55f..9158fc2cc 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -362,6 +362,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix);
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int pix);
 void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                         uint8* dst_u, uint8* dst_v, int pix);
 void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
@@ -433,6 +443,16 @@ void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u,
                              uint8* dst_v, int pix);
 void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb,
                           uint8* dst_u, uint8* dst_v, int pix);
+void BGRAToUVRow_Any_NEON(const uint8* src_bgra, int src_stride_bgra,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void ABGRToUVRow_Any_NEON(const uint8* src_abgr, int src_stride_abgr,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGBAToUVRow_Any_NEON(const uint8* src_rgba, int src_stride_rgba,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void RGB24ToUVRow_Any_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+                           uint8* dst_u, uint8* dst_v, int pix);
+void RAWToUVRow_Any_NEON(const uint8* src_raw, int src_stride_raw,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                             uint8* dst_u, uint8* dst_v, int pix);
 void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555,
@@ -449,6 +469,10 @@ void ABGRToUVRow_C(const uint8* src_abgr, int src_stride_abgr,
                    uint8* dst_u, uint8* dst_v, int width);
 void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba,
                    uint8* dst_u, uint8* dst_v, int width);
+void RGB24ToUVRow_C(const uint8* src_rgb24, int src_stride_rgb24,
+                    uint8* dst_u, uint8* dst_v, int width);
+void RAWToUVRow_C(const uint8* src_raw, int src_stride_raw,
+                  uint8* dst_u, uint8* dst_v, int width);
 void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565,
                      uint8* dst_u, uint8* dst_v, int width);
 void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 486a86488..ac17c19c5 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 480
+#define LIBYUV_VERSION 481
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert.cc b/source/convert.cc
index f9bb84fca..d527fe601 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -1012,6 +1012,12 @@ int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
     if (IS_ALIGNED(width, 8)) {
       BGRAToYRow = BGRAToYRow_NEON;
     }
+    if (width >= 16) {
+      BGRAToUVRow = BGRAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        BGRAToUVRow = BGRAToUVRow_NEON;
+      }
+    }
   }
 #endif
 
@@ -1074,6 +1080,12 @@ int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
     if (IS_ALIGNED(width, 8)) {
       ABGRToYRow = ABGRToYRow_NEON;
     }
+    if (width >= 16) {
+      ABGRToUVRow = ABGRToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        ABGRToUVRow = ABGRToUVRow_NEON;
+      }
+    }
   }
 #endif
 
@@ -1136,6 +1148,12 @@ int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
     if (IS_ALIGNED(width, 8)) {
       RGBAToYRow = RGBAToYRow_NEON;
     }
+    if (width >= 16) {
+      RGBAToUVRow = RGBAToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGBAToUVRow = RGBAToUVRow_NEON;
+      }
+    }
   }
 #endif
 
@@ -1173,6 +1191,25 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
     src_stride_rgb24 = -src_stride_rgb24;
   }
+
+#if defined(HAS_RGB24TOYROW_NEON)
+  void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+                       uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+  void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+      RGB24ToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RGB24ToYRow = RGB24ToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RGB24ToYRow = RGB24ToYRow_NEON;
+    }
+    if (width >= 16) {
+      RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RGB24ToUVRow = RGB24ToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RGB24TOYROW_NEON
   SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
   void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
       RGB24ToARGBRow_C;
@@ -1183,15 +1220,7 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
       RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RGB24TOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB24ToARGBRow = RGB24ToARGBRow_NEON;
-    }
-  }
 #endif
-
   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@@ -1202,23 +1231,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     }
   }
 #endif
-
-#if defined(HAS_RGB24TOYROW_NEON)
-  void (*RGB24ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      RGB24ToYRow_C;
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RGB24ToYRow = RGB24ToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RGB24ToYRow = RGB24ToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
-    }
-  }
-#else
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@@ -1235,13 +1247,14 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
 #endif  // HAS_RGB24TOYROW_NEON
 
   for (int y = 0; y < height - 1; y += 2) {
-    RGB24ToARGBRow(src_rgb24, row, width);
-    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
 #if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
     RGB24ToYRow(src_rgb24, dst_y, width);
     RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
 #else
+    RGB24ToARGBRow(src_rgb24, row, width);
+    RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kMaxStride, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
     ARGBToYRow(row, dst_y, width);
     ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
 #endif
@@ -1251,11 +1264,12 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
     dst_v += dst_stride_v;
   }
   if (height & 1) {
-    RGB24ToARGBRow_C(src_rgb24, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
 #if defined(HAS_RGB24TOYROW_NEON)
+    RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
     RGB24ToYRow(src_rgb24, dst_y, width);
 #else
+    RGB24ToARGBRow(src_rgb24, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
     ARGBToYRow(row, dst_y, width);
 #endif
   }
@@ -1263,7 +1277,6 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
 }
 
 // Convert RAW to I420.
-// Same as RGB24 but RGB vs BGR
 LIBYUV_API
 int RAWToI420(const uint8* src_raw, int src_stride_raw,
               uint8* dst_y, int dst_stride_y,
@@ -1281,6 +1294,25 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     src_raw = src_raw + (height - 1) * src_stride_raw;
     src_stride_raw = -src_stride_raw;
   }
+
+#if defined(HAS_RAWTOYROW_NEON)
+  void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+                     uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+  void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+      RAWToYRow_C;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    RAWToYRow = RAWToYRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      RAWToYRow = RAWToYRow_NEON;
+    }
+    if (width >= 16) {
+      RAWToUVRow = RAWToUVRow_Any_NEON;
+      if (IS_ALIGNED(width, 16)) {
+        RAWToUVRow = RAWToUVRow_NEON;
+      }
+    }
+  }
+#else  // HAS_RAWTOYROW_NEON
   SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
   void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
       RAWToARGBRow_C;
@@ -1291,15 +1323,7 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
       RAWToARGBRow = RAWToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_RAWTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RAWToARGBRow = RAWToARGBRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToARGBRow = RAWToARGBRow_NEON;
-    }
-  }
 #endif
-
   void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@@ -1310,23 +1334,6 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     }
   }
 #endif
-
-#if defined(HAS_RAWTOYROW_NEON)
-  void (*RAWToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
-      RAWToYRow_C;
-  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
-    RAWToYRow = RAWToYRow_Any_NEON;
-    if (IS_ALIGNED(width, 8)) {
-      RAWToYRow = RAWToYRow_NEON;
-    }
-    if (width >= 16) {
-      ARGBToUVRow = ARGBToUVRow_Any_NEON;
-      if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
-      }
-    }
-  }
-#else
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
 #if defined(HAS_ARGBTOUVROW_SSSE3)
@@ -1343,13 +1350,14 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
 #endif  // HAS_RAWTOYROW_NEON
 
   for (int y = 0; y < height - 1; y += 2) {
-    RAWToARGBRow(src_raw, row, width);
-    RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
-    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
 #if defined(HAS_RAWTOYROW_NEON)
+    RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
     RAWToYRow(src_raw, dst_y, width);
     RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
 #else
+    RAWToARGBRow(src_raw, row, width);
+    RAWToARGBRow(src_raw + src_stride_raw, row + kMaxStride, width);
+    ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
     ARGBToYRow(row, dst_y, width);
     ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
 #endif
@@ -1359,11 +1367,12 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw,
     dst_v += dst_stride_v;
   }
   if (height & 1) {
-    RAWToARGBRow_C(src_raw, row, width);
-    ARGBToUVRow(row, 0, dst_u, dst_v, width);
 #if defined(HAS_RAWTOYROW_NEON)
+    RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
     RAWToYRow(src_raw, dst_y, width);
 #else
+    RAWToARGBRow(src_raw, row, width);
+    ARGBToUVRow(row, 0, dst_u, dst_v, width);
     ARGBToYRow(row, dst_y, width);
 #endif
   }
@@ -1550,10 +1559,12 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
 #if defined(HAS_ARGB1555TOYROW_NEON)
     ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
     ARGB1555ToYRow(src_argb1555, dst_y, width);
-    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width);
+    ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+                   width);
 #else
     ARGB1555ToARGBRow(src_argb1555, row, width);
-    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kMaxStride, width);
+    ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kMaxStride,
+                      width);
     ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
     ARGBToYRow(row, dst_y, width);
     ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 2f010ecff..b9c46d222 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -543,8 +543,8 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
     src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
     src_stride_argb1555 = -src_stride_argb1555;
   }
-  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, int pix) =
-      ARGB1555ToARGBRow_C;
+  void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+                            int pix) = ARGB1555ToARGBRow_C;
 #if defined(HAS_ARGB1555TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
@@ -585,8 +585,8 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
     src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
     src_stride_argb4444 = -src_stride_argb4444;
   }
-  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, int pix) =
-      ARGB4444ToARGBRow_C;
+  void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+                            int pix) = ARGB4444ToARGBRow_C;
 #if defined(HAS_ARGB4444TOARGBROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index e62a065ed..da8c83ede 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -95,8 +95,7 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb,
     ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
       ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
-      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
-          IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
         ARGBToUV422Row = ARGBToUV422Row_SSSE3;
       }
     }
@@ -239,6 +238,9 @@ int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         ARGBToUVRow = ARGBToUVRow_NEON;
       }
@@ -345,6 +347,9 @@ int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         ARGBToUVRow = ARGBToUVRow_NEON;
       }
@@ -425,20 +430,27 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
     dst_stride_yuy2 = -dst_stride_yuy2;
   }
-
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
       ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
         ARGBToYRow = ARGBToYRow_SSSE3;
       }
     }
@@ -448,8 +460,11 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
       }
     }
   }
@@ -479,7 +494,7 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
 
   for (int y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    ARGBToUV422Row(src_argb, row_u, row_v, width);
     ARGBToYRow(src_argb, row_y, width);
     I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
     src_argb += src_stride_argb;
@@ -504,20 +519,27 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
     dst_stride_uyvy = -dst_stride_uyvy;
   }
-
-  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
-                      uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+  void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+                         int pix) = ARGBToUV422Row_C;
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+    ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+      if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+        ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+      }
+    }
+  }
+#endif
   void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
       ARGBToYRow_C;
 #if defined(HAS_ARGBTOYROW_SSSE3)
   if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
-    ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
       ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
       if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
-        ARGBToUVRow = ARGBToUVRow_SSSE3;
         ARGBToYRow = ARGBToYRow_SSSE3;
       }
     }
@@ -527,8 +549,11 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
-        ARGBToUVRow = ARGBToUVRow_NEON;
+        ARGBToUV422Row = ARGBToUV422Row_NEON;
       }
     }
   }
@@ -558,7 +583,7 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
   SIMD_ALIGNED(uint8 row_v[kMaxStride / 2]);
 
   for (int y = 0; y < height; ++y) {
-    ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+    ARGBToUV422Row(src_argb, row_u, row_v, width);
     ARGBToYRow(src_argb, row_y, width);
     I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
     src_argb += src_stride_argb;
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index 6dcc6e5de..656d3e563 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -315,8 +315,8 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
     ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
     ARGBToYRow = ARGBToYRow_Any_SSSE3;
     if (IS_ALIGNED(width, 16)) {
-      ARGBToUVRow = ARGBToUVRow_SSSE3;
       ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+      ARGBToUVRow = ARGBToUVRow_SSSE3;
       if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
         ARGBToYRow = ARGBToYRow_SSSE3;
       }
@@ -327,6 +327,9 @@ int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       ARGBToYRow = ARGBToYRow_NEON;
+    }
+    if (width >= 16) {
+      ARGBToUVRow = ARGBToUVRow_Any_NEON;
       if (IS_ALIGNED(width, 16)) {
         ARGBToUVRow = ARGBToUVRow_NEON;
       }
diff --git a/source/row_any.cc b/source/row_any.cc
index 73b2cf41d..1b29777b1 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -248,6 +248,11 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
 #endif
 #ifdef HAS_ARGBTOUVROW_NEON
 UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4)
+UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4)
+UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4)
+UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4)
+UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3)
+UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3)
 UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2)
 UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2)
 UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2)
diff --git a/source/row_common.cc b/source/row_common.cc
index ef43297af..3927af461 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -109,7 +109,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb565, uint8* dst_argb, int width) {
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, int width) {
+void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb,
+                         int width) {
   for (int x = 0; x < width; ++x) {
     uint8 b = src_argb1555[0] & 0x1f;
     uint8 g = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3);
@@ -124,7 +125,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_argb1555, uint8* dst_argb, int width)
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb, int width) {
+void ARGB4444ToARGBRow_C(const uint8* src_argb4444, uint8* dst_argb,
+                         int width) {
   for (int x = 0; x < width; ++x) {
     uint8 b = src_argb4444[0] & 0x0f;
     uint8 g = src_argb4444[0] >> 4;
diff --git a/source/row_neon.cc b/source/row_neon.cc
index 57371e808..342ed2f2f 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -1729,14 +1729,14 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
 
 // 16x2 pixels -> 8x1.  pix is number of argb pixels. e.g. 16.
 #ifdef HAS_ARGBTOUVROW_NEON
-#define RGBTOUV                                                                \
-    "vmul.s16   q8, q0, q10                    \n"  /* B */                    \
-    "vmls.s16   q8, q1, q11                    \n"  /* G */                    \
-    "vmls.s16   q8, q2, q12                    \n"  /* R */                    \
+#define RGBTOUV(QB, QG, QR)                                                    \
+    "vmul.s16   q8, " #QB ", q10               \n"  /* B */                    \
+    "vmls.s16   q8, " #QG ", q11               \n"  /* G */                    \
+    "vmls.s16   q8, " #QR ", q12               \n"  /* R */                    \
     "vadd.u16   q8, q8, q15                    \n"  /* +128 -> unsigned */     \
-    "vmul.s16   q9, q2, q10                    \n"  /* R */                    \
-    "vmls.s16   q9, q1, q14                    \n"  /* G */                    \
-    "vmls.s16   q9, q0, q13                    \n"  /* B */                    \
+    "vmul.s16   q9, " #QR ", q10               \n"  /* R */                    \
+    "vmls.s16   q9, " #QG ", q14               \n"  /* G */                    \
+    "vmls.s16   q9, " #QB ", q13               \n"  /* B */                    \
     "vadd.u16   q9, q9, q15                    \n"  /* +128 -> unsigned */     \
     "vqshrn.u16 d0, q8, #8                     \n"  /* 16 bit to 8 bit U */    \
     "vqshrn.u16 d1, q9, #8                     \n"  /* 16 bit to 8 bit V */
@@ -1764,7 +1764,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
     "vpadal.u8  q1, q5                         \n"  // G 16 bytes -> 8 shorts.
     "vpadal.u8  q2, q6                         \n"  // R 16 bytes -> 8 shorts.
     "subs       %4, %4, #16                    \n"  // 32 processed per loop.
-    RGBTOUV
+    RGBTOUV(q0, q1, q2)
     "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
     "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
     "bgt        1b                             \n"
@@ -1778,6 +1778,197 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
     "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
   );
 }
+
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_bgra
+    "vmov.s16   q10, #112 / 4                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 4                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 4                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 4                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 4                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 BGRA pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 BGRA pixels.
+    "vpaddl.u8  q3, q3                         \n"  // B 16 bytes -> 8 shorts.
+    "vpaddl.u8  q2, q2                         \n"  // G 16 bytes -> 8 shorts.
+    "vpaddl.u8  q1, q1                         \n"  // R 16 bytes -> 8 shorts.
+    "vld4.8     {d8, d10, d12, d14}, [%1]!     \n"  // load 8 more BGRA pixels.
+    "vld4.8     {d9, d11, d13, d15}, [%1]!     \n"  // load last 8 BGRA pixels.
+    "vpadal.u8  q3, q7                         \n"  // B 16 bytes -> 8 shorts.
+    "vpadal.u8  q2, q6                         \n"  // G 16 bytes -> 8 shorts.
+    "vpadal.u8  q1, q5                         \n"  // R 16 bytes -> 8 shorts.
+    "subs       %4, %4, #16                    \n"  // 32 processed per loop.
+    RGBTOUV(q3, q2, q1)
+    "vst1.8     {d0}, [%2]!                    \n"  // store 8 pixels U.
+    "vst1.8     {d1}, [%3]!                    \n"  // store 8 pixels V.
+    "bgt        1b                             \n"
+  : "+r"(src_bgra),  // %0
+    "+r"(src_stride_bgra),  // %1
+    "+r"(dst_u),  // %2
+    "+r"(dst_v),  // %3
+    "+r"(pix)  // %4
+  :
+  : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+    "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+  );
+}
+
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+                      uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "add        %1, %0, %1                     \n"  // src_stride + src_abgr
+    "vmov.s16   q10, #112 / 4                  \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #74 / 4                   \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #38 / 4                   \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #18 / 4                   \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #94 / 4                   \n"  // VG -0.7344 coefficient
+    "vmov.u16   q15, #0x8080                   \n"  // 128.5
+    ".p2align  2                               \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d2, d4, d6}, [%0]!        \n"  // load 8 ABGR pixels.
+    "vld4.8     {d1, d3, d5, d7}, [%0]!        \n"  // load next 8 ABGR pixels.
+    "vpaddl.u8  q2, q2                         \n"  // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_abgr), // %0 + "+r"(src_stride_abgr), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgba + "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. + "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. + "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts. + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgba), // %0 + "+r"(src_stride_rgba), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_rgb24 + "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. + "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q0, q1, q2) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. 
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(src_stride_rgb24), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + +void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_raw + "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. + "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. + "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. + "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts. + "subs %4, %4, #16 \n" // 32 processed per loop. + RGBTOUV(q2, q1, q0) + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_raw), // %0 + "+r"(src_stride_raw), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} + #endif // HAS_ARGBTOUVROW_NEON // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.