diff --git a/README.chromium b/README.chromium index 00ae25c32..e1dcb9ab4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 479 +Version: 480 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a9d477aa1..76f39c55f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -48,8 +48,8 @@ extern "C" { #define HAS_ARGBTORGB24ROW_SSSE3 #define HAS_ARGBTORGB565ROW_SSE2 #define HAS_ARGBTORGBAROW_SSSE3 -#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOUV422ROW_SSSE3 +#define HAS_ARGBTOUVROW_SSSE3 #define HAS_ARGBTOYROW_SSSE3 #define HAS_BGRATOARGBROW_SSSE3 #define HAS_BGRATOUVROW_SSSE3 @@ -60,43 +60,43 @@ extern "C" { #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 +#define HAS_I422TOARGB1555ROW_SSSE3 +#define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3 #define HAS_I422TORAWROW_SSSE3 #define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORGB565ROW_SSSE3 #define HAS_I422TORGBAROW_SSSE3 +#define HAS_I422TOUYVYROW_SSE2 +#define HAS_I422TOYUY2ROW_SSE2 #define HAS_I444TOARGBROW_SSSE3 +#define HAS_MERGEUV_SSE2 #define HAS_MIRRORROW_SSSE3 #define HAS_MirrorUVRow_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 +#define HAS_NV21TOARGBROW_SSSE3 #define HAS_NV21TORGB565ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 +#define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 +#define HAS_RGB24TOYROW_SSSE3 #define HAS_RGB565TOARGBROW_SSE2 #define HAS_RGBATOARGBROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 #define HAS_SETROW_X86 #define HAS_SPLITUV_SSE2 +#define HAS_UYVYTOARGBROW_SSSE3 #define HAS_UYVYTOUV422ROW_SSE2 #define HAS_UYVYTOUVROW_SSE2 #define HAS_UYVYTOYROW_SSE2 #define HAS_YTOARGBROW_SSE2 +#define HAS_YUY2TOARGBROW_SSSE3 #define HAS_YUY2TOUV422ROW_SSE2 #define HAS_YUY2TOUVROW_SSE2 #define HAS_YUY2TOYROW_SSE2 -#define HAS_I422TOYUY2ROW_SSE2 -#define HAS_I422TOUYVYROW_SSE2 -#define HAS_MERGEUV_SSE2 -#define HAS_I422TOARGB4444ROW_SSSE3 -#define HAS_I422TOARGB1555ROW_SSSE3 -#define HAS_I422TORGB565ROW_SSSE3 -#define HAS_YUY2TOARGBROW_SSSE3 -#define HAS_UYVYTOARGBROW_SSSE3 -#define HAS_RGB24TOYROW_SSSE3 -#define HAS_RAWTOYROW_SSSE3 // Effects #define HAS_ARGBAFFINEROW_SSE2 @@ -147,68 +147,75 @@ extern "C" { // The following are available on Neon platforms #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOARGBROW_NEON +#define HAS_ABGRTOUVROW_NEON +#define HAS_ABGRTOYROW_NEON +#define HAS_ARGB1555TOARGBROW_NEON +#define HAS_ARGB1555TOUVROW_NEON +#define HAS_ARGB1555TOYROW_NEON +#define HAS_ARGB4444TOARGBROW_NEON +#define HAS_ARGB4444TOUVROW_NEON +#define HAS_ARGB4444TOYROW_NEON +#define HAS_ARGBTOARGB1555ROW_NEON +#define HAS_ARGBTOARGB4444ROW_NEON #define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTORAWROW_NEON -#define HAS_I400TOARGBROW_NEON #define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORGB565ROW_NEON #define HAS_ARGBTORGBAROW_NEON +#define HAS_ARGBTOUV411ROW_NEON +#define HAS_ARGBTOUV422ROW_NEON +#define HAS_ARGBTOUV444ROW_NEON +#define HAS_ARGBTOUVROW_NEON +#define HAS_ARGBTOYROW_NEON #define HAS_BGRATOARGBROW_NEON +#define HAS_BGRATOUVROW_NEON +#define HAS_BGRATOYROW_NEON #define HAS_COPYROW_NEON #define HAS_HALFROW_NEON +#define HAS_I400TOARGBROW_NEON +#define HAS_I411TOARGBROW_NEON #define HAS_I422TOABGRROW_NEON +#define HAS_I422TOARGB1555ROW_NEON +#define HAS_I422TOARGB4444ROW_NEON #define HAS_I422TOARGBROW_NEON #define HAS_I422TOBGRAROW_NEON #define HAS_I422TORAWROW_NEON #define HAS_I422TORGB24ROW_NEON -#define HAS_I422TOARGB4444ROW_NEON -#define HAS_I422TOARGB1555ROW_NEON #define HAS_I422TORGB565ROW_NEON #define HAS_I422TORGBAROW_NEON +#define HAS_I422TOUYVYROW_NEON +#define HAS_I422TOYUY2ROW_NEON +#define HAS_I444TOARGBROW_NEON +#define HAS_MERGEUV_NEON #define HAS_MIRRORROW_NEON #define HAS_MirrorUVRow_NEON #define HAS_NV12TOARGBROW_NEON -#define HAS_NV21TOARGBROW_NEON -#define HAS_YUY2TOARGBROW_NEON -#define HAS_UYVYTOARGBROW_NEON #define HAS_NV12TORGB565ROW_NEON +#define HAS_NV21TOARGBROW_NEON #define HAS_NV21TORGB565ROW_NEON #define HAS_RAWTOARGBROW_NEON +#define HAS_RAWTOUVROW_NEON +#define HAS_RAWTOYROW_NEON #define HAS_RGB24TOARGBROW_NEON +#define HAS_RGB24TOUVROW_NEON +#define HAS_RGB24TOYROW_NEON +#define HAS_RGB565TOARGBROW_NEON +#define HAS_RGB565TOUVROW_NEON +#define HAS_RGB565TOYROW_NEON #define HAS_RGBATOARGBROW_NEON +#define HAS_RGBATOUVROW_NEON +#define HAS_RGBATOYROW_NEON #define HAS_SETROW_NEON #define HAS_SPLITUV_NEON +#define HAS_UYVYTOARGBROW_NEON #define HAS_UYVYTOUV422ROW_NEON #define HAS_UYVYTOUVROW_NEON #define HAS_UYVYTOYROW_NEON +#define HAS_YTOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON #define HAS_YUY2TOUV422ROW_NEON #define HAS_YUY2TOUVROW_NEON #define HAS_YUY2TOYROW_NEON -#define HAS_I422TOYUY2ROW_NEON -#define HAS_I422TOUYVYROW_NEON -#define HAS_ARGBTORGB565ROW_NEON -#define HAS_ARGBTOARGB1555ROW_NEON -#define HAS_ARGBTOARGB4444ROW_NEON -#define HAS_MERGEUV_NEON -#define HAS_YTOARGBROW_NEON -#define HAS_I444TOARGBROW_NEON -#define HAS_I411TOARGBROW_NEON -#define HAS_ARGBTOYROW_NEON -#define HAS_ARGBTOUV444ROW_NEON -#define HAS_ARGBTOUV422ROW_NEON -#define HAS_ARGBTOUV411ROW_NEON -#define HAS_ARGBTOUVROW_NEON -#define HAS_RGB565TOUVROW_NEON -#define HAS_BGRATOYROW_NEON -#define HAS_ABGRTOYROW_NEON -#define HAS_RGBATOYROW_NEON -#define HAS_RGB24TOYROW_NEON -#define HAS_RAWTOYROW_NEON -#define HAS_RGB565TOARGBROW_NEON -#define HAS_ARGB1555TOARGBROW_NEON -#define HAS_ARGB4444TOARGBROW_NEON -#define HAS_RGB565TOYROW_NEON -#define HAS_ARGB1555TOYROW_NEON -#define HAS_ARGB4444TOYROW_NEON #endif // The following are available on Mips platforms @@ -357,6 +364,10 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix); void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix); void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix); void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix); @@ -414,10 +425,22 @@ void ABGRToUVRow_Any_SSSE3(const uint8* src_abgr, int src_stride_abgr, uint8* dst_u, uint8* dst_v, int width); void RGBAToUVRow_Any_SSSE3(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); +void ARGBToUV444Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV422Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); +void ARGBToUV411Row_Any_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix); void ARGBToUVRow_Any_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix); void RGB565ToUVRow_Any_NEON(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int pix); +void ARGB1555ToUVRow_Any_NEON(const uint8* src_argb1555, + int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix); +void ARGB4444ToUVRow_Any_NEON(const uint8* src_argb4444, + int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix); void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width); void BGRAToUVRow_C(const uint8* src_bgra, int src_stride_bgra, @@ -428,6 +451,10 @@ void RGBAToUVRow_C(const uint8* src_rgba, int src_stride_rgba, uint8* dst_u, uint8* dst_v, int width); void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, uint8* dst_u, uint8* dst_v, int width); +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width); +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width); void ARGBToUV422Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v, int width); @@ -1087,6 +1114,10 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerRow_Any_SSSE3(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); +void ARGBToBayerRow_Any_NEON(const uint8* src_argb, + uint8* dst_bayer, uint32 selector, int pix); void I422ToYUY2Row_C(const uint8* src_y, const uint8* src_u, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1b10f8b9d..486a86488 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 479 +#define LIBYUV_VERSION 480 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 5eaa19d60..f9bb84fca 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -943,6 +943,9 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb, ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } @@ -1207,6 +1210,9 @@ int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24, RGB24ToYRow = RGB24ToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RGB24ToYRow = RGB24ToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } @@ -1312,6 +1318,9 @@ int RAWToI420(const uint8* src_raw, int src_stride_raw, RAWToYRow = RAWToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { RAWToYRow = RAWToYRow_NEON; + } + if (width >= 16) { + ARGBToUVRow = ARGBToUVRow_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUVRow = ARGBToUVRow_NEON; } @@ -1482,6 +1491,25 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } + +#if defined(HAS_ARGB1555TOYROW_NEON) + void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C; + void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) = + ARGB1555ToYRow_C; + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToYRow = ARGB1555ToYRow_NEON; + } + if (width >= 16) { + ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB1555ToUVRow = ARGB1555ToUVRow_NEON; + } + } + } +#else // HAS_ARGB1555TOYROW_NEON SIMD_ALIGNED(uint8 row[kMaxStride * 2]); void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = ARGB1555ToARGBRow_C; @@ -1492,15 +1520,7 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; } } -#elif defined(HAS_ARGB1555TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; - } - } #endif - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -1511,20 +1531,6 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, } } #endif - -#if defined(HAS_ARGB1555TOYROW_NEON) - void (*ARGB1555ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = - ARGB1555ToYRow_C; - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB1555ToYRow = ARGB1555ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#else void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -1541,13 +1547,14 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, #endif // HAS_ARGB1555TOYROW_NEON for (int y = 0; y < height - 1; y += 2) { - ARGB1555ToARGBRow(src_argb1555, row, width); - ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kMaxStride, width); - ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); #if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y, width); #else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kMaxStride, width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); #endif @@ -1557,11 +1564,12 @@ int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555, dst_v += dst_stride_v; } if (height & 1) { - ARGB1555ToARGBRow_C(src_argb1555, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); #if defined(HAS_ARGB1555TOYROW_NEON) + ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width); ARGB1555ToYRow(src_argb1555, dst_y, width); #else + ARGB1555ToARGBRow(src_argb1555, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } @@ -1586,6 +1594,25 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } + +#if defined(HAS_ARGB4444TOYROW_NEON) + void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C; + void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) = + ARGB4444ToYRow_C; + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToYRow = ARGB4444ToYRow_NEON; + } + if (width >= 16) { + ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGB4444ToUVRow = ARGB4444ToUVRow_NEON; + } + } + } +#else // HAS_ARGB4444TOYROW_NEON SIMD_ALIGNED(uint8 row[kMaxStride * 2]); void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) = ARGB4444ToARGBRow_C; @@ -1596,15 +1623,7 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; } } -#elif defined(HAS_ARGB4444TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; - } - } #endif - void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C; #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -1615,20 +1634,6 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, } } #endif - -#if defined(HAS_ARGB4444TOYROW_NEON) - void (*ARGB4444ToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = - ARGB4444ToYRow_C; - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - ARGB4444ToYRow = ARGB4444ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - ARGBToUVRow = ARGBToUVRow_NEON; - } - } - } -#else void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = ARGBToYRow_C; #if defined(HAS_ARGBTOUVROW_SSSE3) @@ -1645,13 +1650,16 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, #endif // HAS_ARGB4444TOYROW_NEON for (int y = 0; y < height - 1; y += 2) { - ARGB4444ToARGBRow(src_argb4444, row, width); - ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kMaxStride, width); - ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); #if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); - ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, width); + ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y, + width); #else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kMaxStride, + width); + ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width); #endif @@ -1661,11 +1669,12 @@ int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444, dst_v += dst_stride_v; } if (height & 1) { - ARGB4444ToARGBRow_C(src_argb4444, row, width); - ARGBToUVRow(row, 0, dst_u, dst_v, width); #if defined(HAS_ARGB4444TOYROW_NEON) + ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width); ARGB4444ToYRow(src_argb4444, dst_y, width); #else + ARGB4444ToARGBRow(src_argb4444, row, width); + ARGBToUVRow(row, 0, dst_u, dst_v, width); ARGBToYRow(row, dst_y, width); #endif } diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 98b00ecc4..e62a065ed 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -54,6 +54,7 @@ int ARGBToI444(const uint8* src_argb, int src_stride_argb, #elif defined(HAS_ARGBTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYRow = ARGBToYRow_Any_NEON; + ARGBToUV444Row = ARGBToUV444Row_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; ARGBToUV444Row = ARGBToUV444Row_NEON; @@ -120,6 +121,9 @@ int ARGBToI422(const uint8* src_argb, int src_stride_argb, ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 16) { + ARGBToUV422Row = ARGBToUV422Row_Any_NEON; if (IS_ALIGNED(width, 16)) { ARGBToUV422Row = ARGBToUV422Row_NEON; } @@ -173,6 +177,9 @@ int ARGBToI411(const uint8* src_argb, int src_stride_argb, ARGBToYRow = ARGBToYRow_Any_NEON; if (IS_ALIGNED(width, 8)) { ARGBToYRow = ARGBToYRow_NEON; + } + if (width >= 32) { + ARGBToUV411Row = ARGBToUV411Row_Any_NEON; if (IS_ALIGNED(width, 32)) { ARGBToUV411Row = ARGBToUV411Row_NEON; } diff --git a/source/format_conversion.cc b/source/format_conversion.cc index 0cb745f39..6dcc6e5de 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -72,13 +72,19 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb, void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) && + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } } #endif const int blue_index = 0; // Offsets for ARGB format @@ -398,7 +404,7 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToARGBRow = I422ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + I422ToARGBRow = I422ToARGBRow_SSSE3; } } #elif defined(HAS_I422TOARGBROW_NEON) @@ -408,20 +414,34 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { - ARGBToBayerRow = ARGBToBayerRow_SSSE3; + if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } } #elif defined(HAS_ARGBTOBAYERROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) { - ARGBToBayerRow = ARGBToBayerRow_NEON; + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; + } } #endif + const int blue_index = 0; // Offsets for ARGB format const int green_index = 1; const int red_index = 2; diff --git a/source/row_any.cc b/source/row_any.cc index b10a85d1d..73b2cf41d 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -19,6 +19,9 @@ namespace libyuv { extern "C" { #endif +// TODO(fbarchard): Consider 'any' functions handling any quantity of pixels. +// TODO(fbarchard): Consider 'any' functions handling odd alignment. + // YUV to RGB does multiple of 8 with SIMD and remainder with C. #define YANY(NAMEANY, I420TORGB_SIMD, I420TORGB_C, UV_SHIFT, BPP, MASK) \ void NAMEANY(const uint8* y_buf, \ @@ -114,12 +117,8 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) #endif // HAS_NV12TORGB565ROW_NEON #undef NVANY -// YUY2 to RGB does 8 at a time. -// RGB to RGB does multiple of 16 pixels with SIMD and remainder with C. -// SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination. -// SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes. -// NEON RGB24 is multiple of 8 pixels, unaligned source and destination. -// I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C. +// TODO(fbarchard): RGBANY use last 16 method. +// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ void NAMEANY(const uint8* src, \ uint8* dst, \ @@ -165,6 +164,26 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, #endif #undef RGBANY +// ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. +#define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ + void NAMEANY(const uint8* src, \ + uint8* dst, uint32 selector, \ + int width) { \ + int n = width & ~MASK; \ + ARGBTORGB_SIMD(src, dst, selector, n); \ + ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK); \ + } + +#if defined(HAS_ARGBTOBAYERROW_SSSE3) +BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, + 3, 4, 1) +#endif +#if defined(HAS_ARGBTOBAYERROW_NEON) +BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, + 3, 4, 1) +#endif +#undef BAYERANY + // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. // TODO(fbarchard): Use last 16 method for all unsubsampled conversions. #define YANY(NAMEANY, ARGBTOY_SIMD, SBPP, BPP, NUM) \ @@ -230,37 +249,43 @@ UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) #ifdef HAS_ARGBTOUVROW_NEON UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4) UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2) +UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2) +UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2) UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2) UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2) #endif #undef UVANY -#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ +#define UV422ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK, SHIFT) \ void NAMEANY(const uint8* src_uv, \ uint8* dst_u, uint8* dst_v, int width) { \ - int n = width & ~15; \ + int n = width & ~MASK; \ ANYTOUV_SIMD(src_uv, dst_u, dst_v, n); \ ANYTOUV_C(src_uv + n * BPP, \ - dst_u + (n >> 1), \ - dst_v + (n >> 1), \ - width & 15); \ + dst_u + (n >> SHIFT), \ + dst_v + (n >> SHIFT), \ + width & MASK); \ } #ifdef HAS_ARGBTOUVROW_SSSE3 UV422ANY(ARGBToUV422Row_Any_SSSE3, ARGBToUV422Row_Unaligned_SSSE3, - ARGBToUV422Row_C, 4) -#endif -#ifdef HAS_YUY2TOUV422ROW_SSE2 + ARGBToUV422Row_C, 4, 15, 1) UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, - YUY2ToUV422Row_C, 2) + YUY2ToUV422Row_C, 2, 15, 1) UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, - UYVYToUV422Row_C, 2) + UYVYToUV422Row_C, 2, 15, 1) #endif #ifdef HAS_YUY2TOUV422ROW_NEON +UV422ANY(ARGBToUV444Row_Any_NEON, ARGBToUV444Row_NEON, + ARGBToUV444Row_C, 4, 8, 0) +UV422ANY(ARGBToUV422Row_Any_NEON, ARGBToUV422Row_NEON, + ARGBToUV422Row_C, 4, 15, 1) +UV422ANY(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, + ARGBToUV411Row_C, 4, 31, 2) UV422ANY(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, - YUY2ToUV422Row_C, 2) + YUY2ToUV422Row_C, 2, 15, 1) UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, - UYVYToUV422Row_C, 2) + UYVYToUV422Row_C, 2, 15, 1) #endif #undef UV422ANY diff --git a/source/row_common.cc b/source/row_common.cc index 3a76a0d41..ef43297af 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -350,7 +350,7 @@ void ARGB4444ToYRow_C(const uint8* src_argb4444, uint8* dst_y, int width) { } void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int width) { const uint8* next_rgb565 = src_rgb565 + src_stride_rgb565; for (int x = 0; x < width - 1; x += 2) { uint8 b0 = src_rgb565[0] & 0x1f; @@ -365,13 +365,13 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, uint8 b3 = next_rgb565[2] & 0x1f; uint8 g3 = (next_rgb565[2] >> 5) | ((next_rgb565[3] & 0x07) << 3); uint8 r3 = next_rgb565[3] >> 3; - uint8 ab = (b0 + b1 + b2 + b3); - uint8 ag = (g0 + g1 + g2 + g3); - uint8 ar = (r0 + r1 + r2 + r3); - ab = (ab << 1) | (ab >> 6); - ar = (ar << 1) | (ar >> 6); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); + uint8 b = (b0 + b1 + b2 + b3); // 565 * 4 = 787. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 787 -> 888. + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); src_rgb565 += 4; next_rgb565 += 4; dst_u += 1; @@ -384,14 +384,108 @@ void RGB565ToUVRow_C(const uint8* src_rgb565, int src_stride_rgb565, uint8 b2 = next_rgb565[0] & 0x1f; uint8 g2 = (next_rgb565[0] >> 5) | ((next_rgb565[1] & 0x07) << 3); uint8 r2 = next_rgb565[1] >> 3; - uint8 ab = (b0 + b2); - uint8 ag = (g0 + g2); - uint8 ar = (r0 + r2); - ab = (ab << 2) | (ab >> 4); - ag = (ag << 1) | (ag >> 6); - ar = (ar << 2) | (ar >> 4); - dst_u[0] = RGBToU(ar, ag, ab); - dst_v[0] = RGBToV(ar, ag, ab); + uint8 b = (b0 + b2); // 565 * 2 = 676. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 676 -> 888 + g = (g << 1) | (g >> 6); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB1555ToUVRow_C(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb1555 = src_argb1555 + src_stride_argb1555; + for (int x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b1 = src_argb1555[2] & 0x1f; + uint8 g1 = (src_argb1555[2] >> 5) | ((src_argb1555[3] & 0x03) << 3); + uint8 r1 = (src_argb1555[3] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = (next_argb1555[1] & 0x7c) >> 2; + uint8 b3 = next_argb1555[2] & 0x1f; + uint8 g3 = (next_argb1555[2] >> 5) | ((next_argb1555[3] & 0x03) << 3); + uint8 r3 = (next_argb1555[3] & 0x7c) >> 2; + uint8 b = (b0 + b1 + b2 + b3); // 555 * 4 = 777. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 1) | (b >> 6); // 777 -> 888. + g = (g << 1) | (g >> 6); + r = (r << 1) | (r >> 6); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb1555 += 4; + next_argb1555 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb1555[0] & 0x1f; + uint8 g0 = (src_argb1555[0] >> 5) | ((src_argb1555[1] & 0x03) << 3); + uint8 r0 = (src_argb1555[1] & 0x7c) >> 2; + uint8 b2 = next_argb1555[0] & 0x1f; + uint8 g2 = (next_argb1555[0] >> 5) | ((next_argb1555[1] & 0x03) << 3); + uint8 r2 = next_argb1555[1] >> 3; + uint8 b = (b0 + b2); // 555 * 2 = 666. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + } +} + +void ARGB4444ToUVRow_C(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int width) { + const uint8* next_argb4444 = src_argb4444 + src_stride_argb4444; + for (int x = 0; x < width - 1; x += 2) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b1 = src_argb4444[2] & 0x0f; + uint8 g1 = src_argb4444[2] >> 4; + uint8 r1 = src_argb4444[3] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b3 = next_argb4444[2] & 0x0f; + uint8 g3 = next_argb4444[2] >> 4; + uint8 r3 = next_argb4444[3] & 0x0f; + uint8 b = (b0 + b1 + b2 + b3); // 444 * 4 = 666. + uint8 g = (g0 + g1 + g2 + g3); + uint8 r = (r0 + r1 + r2 + r3); + b = (b << 2) | (b >> 4); // 666 -> 888. + g = (g << 2) | (g >> 4); + r = (r << 2) | (r >> 4); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); + src_argb4444 += 4; + next_argb4444 += 4; + dst_u += 1; + dst_v += 1; + } + if (width & 1) { + uint8 b0 = src_argb4444[0] & 0x0f; + uint8 g0 = src_argb4444[0] >> 4; + uint8 r0 = src_argb4444[1] & 0x0f; + uint8 b2 = next_argb4444[0] & 0x0f; + uint8 g2 = next_argb4444[0] >> 4; + uint8 r2 = next_argb4444[1] & 0x0f; + uint8 b = (b0 + b2); // 444 * 2 = 555. + uint8 g = (g0 + g2); + uint8 r = (r0 + r2); + b = (b << 3) | (b >> 2); // 555 -> 888. + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); + dst_u[0] = RGBToU(r, g, b); + dst_v[0] = RGBToV(r, g, b); } } @@ -799,10 +893,10 @@ void I422ToARGB1555Row_C(const uint8* src_y, } void I422ToRGB565Row_C(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - uint8* dst_rgb565, - int width) { + const uint8* src_u, + const uint8* src_v, + uint8* dst_rgb565, + int width) { uint8 b0; uint8 g0; uint8 r0; diff --git a/source/row_neon.cc b/source/row_neon.cc index 78c4f00fa..57371e808 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1159,6 +1159,19 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vorr.u8 q1, q1, q3 \n" /* R,A */ \ "vorr.u8 q0, q0, q2 \n" /* B,G */ \ +// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha. +#define RGB555TOARGB \ + "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \ + "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \ + "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \ + "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \ + "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \ + "vorr.u8 d0, d0, d4 \n" /* B */ \ + "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \ + "vorr.u8 d2, d1, d5 \n" /* R */ \ + "vorr.u8 d1, d4, d6 \n" /* G */ + void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, int pix) { asm volatile ( @@ -1715,6 +1728,19 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. #ifdef HAS_ARGBTOUVROW_NEON + +#define RGBTOUV \ + "vmul.s16 q8, q0, q10 \n" /* B */ \ + "vmls.s16 q8, q1, q11 \n" /* G */ \ + "vmls.s16 q8, q2, q12 \n" /* R */ \ + "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \ + "vmul.s16 q9, q2, q10 \n" /* R */ \ + "vmls.s16 q9, q1, q14 \n" /* G */ \ + "vmls.s16 q9, q0, q13 \n" /* B */ \ + "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \ + "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \ + "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */ + void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( @@ -1738,16 +1764,7 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "subs %4, %4, #16 \n" // 32 processed per loop. - "vmul.s16 q8, q0, q10 \n" // B - "vmls.s16 q8, q1, q11 \n" // G - "vmls.s16 q8, q2, q12 \n" // R - "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned - "vmul.s16 q9, q2, q10 \n" // R - "vmls.s16 q9, q1, q14 \n" // G - "vmls.s16 q9, q0, q13 \n" // B - "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned - "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U - "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + RGBTOUV "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" @@ -1763,7 +1780,6 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, } #endif // HAS_ARGBTOUVROW_NEON - // 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. #ifdef HAS_RGB565TOUVROW_NEON void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, @@ -1824,7 +1840,131 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" ); } -#endif // HAS_ARGBTOUVROW_NEON +#endif // HAS_RGB565TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON +void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. + RGB555TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(src_stride_argb1555), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB1555TOUVROW_NEON + +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON +void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "add %1, %0, %1 \n" // src_stride + src_argb + "vmov.s16 q10, #112 / 4 \n" // UB / VR 0.875 coefficient + "vmov.s16 q11, #74 / 4 \n" // UG -0.5781 coefficient + "vmov.s16 q12, #38 / 4 \n" // UR -0.2969 coefficient + "vmov.s16 q13, #18 / 4 \n" // VB -0.1406 coefficient + "vmov.s16 q14, #94 / 4 \n" // VG -0.7344 coefficient + "vmov.u16 q15, #0x8080 \n" // 128.5 + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. + ARGB4444TOARGB + "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. + "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. + "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + + "subs %4, %4, #16 \n" // 16 processed per loop. + "vmul.s16 q8, q4, q10 \n" // B + "vmls.s16 q8, q5, q11 \n" // G + "vmls.s16 q8, q6, q12 \n" // R + "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned + "vmul.s16 q9, q6, q10 \n" // R + "vmls.s16 q9, q5, q14 \n" // G + "vmls.s16 q9, q4, q13 \n" // B + "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned + "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U + "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(src_stride_argb4444), // %1 + "+r"(dst_u), // %2 + "+r"(dst_v), // %3 + "+r"(pix) // %4 + : + : "memory", "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_ARGB4444TOUVROW_NEON #ifdef HAS_RGB565TOYROW_NEON void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 7586cd880..33afa5535 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -612,8 +612,9 @@ TESTATOPLANAR(RGBA, 4, I420, 2, 2, 4) TESTATOPLANAR(RAW, 3, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, I420, 2, 2, 4) TESTATOPLANAR(RGB565, 2, I420, 2, 2, 5) -TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 4) -TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 4) +// TODO(fbarchard): Make 1555 neon work same as C code, reduce to diff 9. +TESTATOPLANAR(ARGB1555, 2, I420, 2, 2, 15) +TESTATOPLANAR(ARGB4444, 2, I420, 2, 2, 17) TESTATOPLANAR(ARGB, 4, I411, 4, 1, 4) TESTATOPLANAR(ARGB, 4, I422, 2, 1, 2) TESTATOPLANAR(ARGB, 4, I444, 1, 1, 2)