diff --git a/README.chromium b/README.chromium index 460d8ecc1..720dea5b7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1780 +Version: 1781 License: BSD License File: LICENSE diff --git a/docs/formats.md b/docs/formats.md index 5fc19d453..d628f7f96 100644 --- a/docs/formats.md +++ b/docs/formats.md @@ -54,12 +54,14 @@ The following is extracted from video_common.h as a complete list of formats sup // 1 Secondary YUV format: row biplanar. FOURCC_M420 = FOURCC('M', '4', '2', '0'), // deprecated. - // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc, 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. + FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), @@ -180,6 +182,14 @@ The 2 bit alpha has 4 values. Here are the comparable 8 bit alpha values. The 10 bit RGB values range from 0 to 1023. XR30 is the same as AR30 but with no alpha channel. +# AB64 and AR64 + +AB64 is similar to ABGR, with 16 bit (2 bytes) per channel. Each channel stores an unsigned short. +In memory R is the lowest and A is the highest. +Each channel has value ranges from 0 to 65535. +AR64 is similar to ARGB. + + # NV12 and NV21 NV12 is a biplanar format with a full sized Y plane followed by a single diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 474a82147..f76c12dd9 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -1065,6 +1065,42 @@ int AR30ToAB30(const uint8_t* src_ar30, int width, int height); +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AB64 to ABGR. +#define AB64ToABGR AR64ToARGB + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height); + +// Convert AR64 to ABGR. +#define AR64ToABGR AB64ToARGB + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert AB64 To AR64. +#define AB64ToAR64 AR64ToAB64 + // src_width/height provided by capture // dst_width/height for clipping determine final size. LIBYUV_API diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index d992363ce..bf4878604 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -153,6 +153,30 @@ int ARGBToI444(const uint8_t* src_argb, int width, int height); +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height); + +// Convert ABGR to AB64. +#define ABGRToAB64 ARGBToAR64 + +// Convert ARGB to AB64. 
+LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height); + +// Convert ABGR to AR64. +#define ABGRToAR64 ARGBToAB64 + // Convert ARGB To I422. LIBYUV_API int ARGBToI422(const uint8_t* src_argb, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index ebefb5682..85dda98c1 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -945,7 +945,7 @@ void ARGBAffineRow_SSE2(const uint8_t* src_argb, int width); // Shuffle ARGB channel order. e.g. BGRA to ARGB. -// shuffler is 16 bytes and must be aligned. +// shuffler is 16 bytes. LIBYUV_API int ARGBShuffle(const uint8_t* src_bgra, int src_stride_bgra, @@ -955,6 +955,17 @@ int ARGBShuffle(const uint8_t* src_bgra, int width, int height); +// Shuffle AR64 channel order. e.g. AR64 to AB64. +// shuffler is 16 bytes. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height); + // Sobel ARGB effect with planar output. LIBYUV_API int ARGBSobelToPlane(const uint8_t* src_argb, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7af36c743..9f159d40a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -273,6 +273,10 @@ extern "C" { (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER))) #define HAS_ABGRTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3 +#define HAS_ARGBTOAR64ROW_SSSE3 +#define HAS_ARGBTOAB64ROW_SSSE3 +#define HAS_AR64TOARGBROW_SSSE3 +#define HAS_AB64TOARGBROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 #define HAS_HALFMERGEUVROW_SSSE3 @@ -318,6 +322,10 @@ extern "C" { #define HAS_ARGBTOAR30ROW_AVX2 #define HAS_ARGBTORAWROW_AVX2 #define HAS_ARGBTORGB24ROW_AVX2 +#define HAS_ARGBTOAR64ROW_AVX2 +#define HAS_ARGBTOAB64ROW_AVX2 +#define HAS_AR64TOARGBROW_AVX2 +#define HAS_AB64TOARGBROW_AVX2 #define HAS_CONVERT16TO8ROW_AVX2 #define HAS_CONVERT8TO16ROW_AVX2 #define HAS_DIVIDEROW_16_AVX2 @@ -383,6 +391,10 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON +#define HAS_ARGBTOAR64ROW_NEON +#define HAS_ARGBTOAB64ROW_NEON +#define HAS_AR64TOARGBROW_NEON +#define HAS_AB64TOARGBROW_NEON #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON @@ -2563,6 +2575,71 @@ void ARGBToARGB4444Row_C(const uint8_t* src_argb, uint8_t* dst_rgb, int width); void ABGRToAR30Row_C(const uint8_t* src_abgr, uint8_t* dst_ar30, int width); void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width); +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width); +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width); +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width); +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width); +void AB64ToARGBRow_SSSE3(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width); +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ar64, int 
width); +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_AVX2(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width); +void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width); +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width); +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width); +void ARGBToAR64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_SSSE3(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_SSSE3(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_AVX2(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_AVX2(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void ARGBToAR64Row_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void ARGBToAB64Row_Any_NEON(const uint8_t* src_ptr, + uint16_t* dst_ptr, + int width); +void AR64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); +void AB64ToARGBRow_Any_NEON(const uint16_t* src_ptr, + uint8_t* dst_ptr, + int width); + void J400ToARGBRow_SSE2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_AVX2(const uint8_t* src_y, uint8_t* dst_argb, int width); void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 7016100d5..324d32f94 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1780 +#define LIBYUV_VERSION 1781 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index 0da3fb554..32b8a5210 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -65,12 +65,14 @@ enum FourCC { // 1 Secondary YUV format: row biplanar. deprecated. FOURCC_M420 = FOURCC('M', '4', '2', '0'), - // 11 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc + // 13 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp, 1 10 bpc 2 64 bpp FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'), FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'), FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'), FOURCC_AR30 = FOURCC('A', 'R', '3', '0'), // 10 bit per channel. 2101010. FOURCC_AB30 = FOURCC('A', 'B', '3', '0'), // ABGR version of 10 bit + FOURCC_AR64 = FOURCC('A', 'R', '6', '4'), // 16 bit per channel. 
+ FOURCC_AB64 = FOURCC('A', 'B', '6', '4'), // ABGR version of 16 bit FOURCC_24BG = FOURCC('2', '4', 'B', 'G'), FOURCC_RAW = FOURCC('r', 'a', 'w', ' '), FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'), @@ -163,6 +165,8 @@ enum FourCCBpp { FOURCC_BPP_RGBA = 32, FOURCC_BPP_AR30 = 32, FOURCC_BPP_AB30 = 32, + FOURCC_BPP_AR64 = 64, + FOURCC_BPP_AB64 = 64, FOURCC_BPP_24BG = 24, FOURCC_BPP_RAW = 24, FOURCC_BPP_RGBP = 16, diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 2b3d52d2c..0bd330ec3 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -2791,6 +2791,10 @@ static const uvec8 kShuffleMaskABGRToARGB = { static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u}; +// Shuffle table for converting AR64 to AB64. +static const uvec8 kShuffleMaskAR64ToAB64 = { + 4u, 5u, 2u, 3u, 0u, 1u, 6u, 7u, 12u, 13u, 10u, 11u, 8u, 9u, 14u, 15u}; + // Convert BGRA to ARGB. LIBYUV_API int BGRAToARGB(const uint8_t* src_bgra, @@ -2800,7 +2804,7 @@ int BGRAToARGB(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ARGB to BGRA (same as BGRAToARGB). @@ -2812,7 +2816,7 @@ int ARGBToBGRA(const uint8_t* src_bgra, int width, int height) { return ARGBShuffle(src_bgra, src_stride_bgra, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskBGRAToARGB), width, height); + (const uint8_t*)&kShuffleMaskBGRAToARGB, width, height); } // Convert ABGR to ARGB. @@ -2824,7 +2828,7 @@ int ABGRToARGB(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert ARGB to ABGR to (same as ABGRToARGB). @@ -2836,7 +2840,7 @@ int ARGBToABGR(const uint8_t* src_abgr, int width, int height) { return ARGBShuffle(src_abgr, src_stride_abgr, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskABGRToARGB), width, height); + (const uint8_t*)&kShuffleMaskABGRToARGB, width, height); } // Convert RGBA to ARGB. @@ -2848,7 +2852,19 @@ int RGBAToARGB(const uint8_t* src_rgba, int width, int height) { return ARGBShuffle(src_rgba, src_stride_rgba, dst_argb, dst_stride_argb, - (const uint8_t*)(&kShuffleMaskRGBAToARGB), width, height); + (const uint8_t*)&kShuffleMaskRGBAToARGB, width, height); +} + +// Convert AR64 To AB64. +LIBYUV_API +int AR64ToAB64(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + return AR64Shuffle(src_ar64, src_stride_ar64, dst_ab64, dst_stride_ab64, + (const uint8_t*)&kShuffleMaskAR64ToAB64, width, height); } // Convert RGB24 to ARGB. @@ -3357,6 +3373,124 @@ int AR30ToAB30(const uint8_t* src_ar30, return 0; } +// Convert AR64 to ARGB. +LIBYUV_API +int AR64ToARGB(const uint16_t* src_ar64, + int src_stride_ar64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AR64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AR64ToARGBRow_C; + if (!src_ar64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. 
+ if (src_stride_ar64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_argb = 0; + } +#if defined(HAS_AR64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ToARGBRow = AR64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AR64ToARGBRow = AR64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ToARGBRow = AR64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AR64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ToARGBRow = AR64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AR64ToARGBRow = AR64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ToARGBRow(src_ar64, dst_argb, width); + src_ar64 += src_stride_ar64; + dst_argb += dst_stride_argb; + } + return 0; +} + +// Convert AB64 to ARGB. +LIBYUV_API +int AB64ToARGB(const uint16_t* src_ab64, + int src_stride_ab64, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + int y; + void (*AB64ToARGBRow)(const uint16_t* src_ar64, uint8_t* dst_argb, + int width) = AB64ToARGBRow_C; + if (!src_ab64 || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ab64 = src_ab64 + (height - 1) * src_stride_ab64; + src_stride_ab64 = -src_stride_ab64; + } + // Coalesce rows. + if (src_stride_ab64 == width * 4 && dst_stride_argb == width * 4) { + width *= height; + height = 1; + src_stride_ab64 = dst_stride_argb = 0; + } +#if defined(HAS_AB64TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AB64ToARGBRow = AB64ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + AB64ToARGBRow = AB64ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AB64ToARGBRow = AB64ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_AB64TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AB64ToARGBRow = AB64ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + AB64ToARGBRow = AB64ToARGBRow_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + AB64ToARGBRow(src_ab64, dst_argb, width); + src_ab64 += src_stride_ab64; + dst_argb += dst_stride_argb; + } + return 0; +} + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 4ba4bb5e0..e14615847 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -2009,6 +2009,124 @@ int ARGBToJ422(const uint8_t* src_argb, return 0; } +// Convert ARGB to AR64. +LIBYUV_API +int ARGBToAR64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ar64, + int dst_stride_ar64, + int width, + int height) { + int y; + void (*ARGBToAR64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAR64Row_C; + if (!src_argb || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. 
+ if (src_stride_argb == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ar64 = 0; + } +#if defined(HAS_ARGBTOAR64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAR64Row = ARGBToAR64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAR64Row = ARGBToAR64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAR64Row = ARGBToAR64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAR64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAR64Row = ARGBToAR64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAR64Row = ARGBToAR64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAR64Row(src_argb, dst_ar64, width); + src_argb += src_stride_argb; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + +// Convert ARGB to AB64. +LIBYUV_API +int ARGBToAB64(const uint8_t* src_argb, + int src_stride_argb, + uint16_t* dst_ab64, + int dst_stride_ab64, + int width, + int height) { + int y; + void (*ARGBToAB64Row)(const uint8_t* src_argb, uint16_t* dst_ar64, + int width) = ARGBToAB64Row_C; + if (!src_argb || !dst_ab64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + // Coalesce rows. + if (src_stride_argb == width * 4 && dst_stride_ab64 == width * 4) { + width *= height; + height = 1; + src_stride_argb = dst_stride_ab64 = 0; + } +#if defined(HAS_ARGBTOAB64ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBToAB64Row = ARGBToAB64Row_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBToAB64Row = ARGBToAB64Row_SSSE3; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToAB64Row = ARGBToAB64Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_AVX2; + } + } +#endif +#if defined(HAS_ARGBTOAB64ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToAB64Row = ARGBToAB64Row_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToAB64Row = ARGBToAB64Row_NEON; + } + } +#endif + + for (y = 0; y < height; ++y) { + ARGBToAB64Row(src_argb, dst_ab64, width); + src_argb += src_stride_argb; + dst_ab64 += dst_stride_ab64; + } + return 0; +} + // Convert ARGB to J400. LIBYUV_API int ARGBToJ400(const uint8_t* src_argb, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 219c21650..2f2089fbd 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -3527,6 +3527,76 @@ int ARGBShuffle(const uint8_t* src_bgra, return 0; } +// Shuffle AR64 channel order. e.g. AR64 to AB64. +LIBYUV_API +int AR64Shuffle(const uint16_t* src_ar64, + int src_stride_ar64, + uint16_t* dst_ar64, + int dst_stride_ar64, + const uint8_t* shuffler, + int width, + int height) { + int y; + void (*AR64ShuffleRow)(const uint8_t* src_ar64, uint8_t* dst_ar64, + const uint8_t* shuffler, int width) = AR64ShuffleRow_C; + if (!src_ar64 || !dst_ar64 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_ar64 = src_ar64 + (height - 1) * src_stride_ar64; + src_stride_ar64 = -src_stride_ar64; + } + // Coalesce rows. 
+ if (src_stride_ar64 == width * 4 && dst_stride_ar64 == width * 4) { + width *= height; + height = 1; + src_stride_ar64 = dst_stride_ar64 = 0; + } + // Assembly versions can be reused if it's implemented with shuffle. +#if defined(HAS_ARGBSHUFFLEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + AR64ShuffleRow = ARGBShuffleRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + AR64ShuffleRow = ARGBShuffleRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + AR64ShuffleRow = ARGBShuffleRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + AR64ShuffleRow = ARGBShuffleRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + AR64ShuffleRow = ARGBShuffleRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + AR64ShuffleRow = ARGBShuffleRow_NEON; + } + } +#endif +#if defined(HAS_ARGBSHUFFLEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + AR64ShuffleRow = ARGBShuffleRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + AR64ShuffleRow = ARGBShuffleRow_MMI; + } + } +#endif + + for (y = 0; y < height; ++y) { + AR64ShuffleRow((uint8_t*)(src_ar64), (uint8_t*)(dst_ar64), shuffler, + width * 2); + src_ar64 += src_stride_ar64; + dst_ar64 += dst_stride_ar64; + } + return 0; +} + // Gauss blur a float plane using Gaussian 5x5 filter with // coefficients of 1, 4, 6, 4, 1. // Each destination pixel is a blur of the 5x5 diff --git a/source/row_any.cc b/source/row_any.cc index f68d2ed64..b8c7f536e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1237,6 +1237,72 @@ ANY11P(ARGBShuffleRow_Any_MMI, ARGBShuffleRow_MMI, const uint8_t*, 4, 4, 1) #undef ANY11P #undef ANY11P +// Any 1 to 1 with type +#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int width) { \ + SIMD_ALIGNED(uint8_t temp[(MASK + 1) * SBPP]); \ + SIMD_ALIGNED(uint8_t out[(MASK + 1) * BPP]); \ + memset(temp, 0, (MASK + 1) * SBPP); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n); \ + } \ + memcpy(temp, (uint8_t*)(src_ptr) + n * SBPP, r * SBPP); \ + ANY_SIMD((STYPE*)temp, (DTYPE*)out, MASK + 1); \ + memcpy((uint8_t*)(dst_ptr) + n * BPP, out, r * BPP); \ + } + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(ARGBToAR64Row_Any_SSSE3, ARGBToAR64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_ARGBTOAB64ROW_SSSE3 +ANY11T(ARGBToAB64Row_Any_SSSE3, ARGBToAB64Row_SSSE3, 4, 8, uint8_t, uint16_t, 3) +#endif + +#ifdef HAS_AR64TOARGBROW_SSSE3 +ANY11T(AR64ToARGBRow_Any_SSSE3, AR64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_SSSE3 +ANY11T(AB64ToARGBRow_Any_SSSE3, AB64ToARGBRow_SSSE3, 8, 4, uint16_t, uint8_t, 3) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(ARGBToAR64Row_Any_AVX2, ARGBToAR64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +ANY11T(ARGBToAB64Row_Any_AVX2, ARGBToAB64Row_AVX2, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +ANY11T(AR64ToARGBRow_Any_AVX2, AR64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +ANY11T(AB64ToARGBRow_Any_AVX2, AB64ToARGBRow_AVX2, 8, 4, uint16_t, uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(ARGBToAR64Row_Any_NEON, ARGBToAR64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_ARGBTOAB64ROW_NEON +ANY11T(ARGBToAB64Row_Any_NEON, ARGBToAB64Row_NEON, 4, 8, uint8_t, uint16_t, 7) +#endif + +#ifdef HAS_AR64TOARGBROW_NEON +ANY11T(AR64ToARGBRow_Any_NEON, AR64ToARGBRow_NEON, 8, 4, uint16_t, 
uint8_t, 7) +#endif + +#ifdef HAS_ARGBTOAR64ROW_NEON +ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7) +#endif + +#undef ANY11T + // Any 1 to 1 with parameter and shorts. BPP measures in shorts. #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \ @@ -1403,7 +1469,7 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) #undef ANY11C // Any 1 to 1 interpolate. Takes 2 rows of source via stride. -#define ANY11T(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ +#define ANY11I(NAMEANY, ANY_SIMD, SBPP, BPP, MASK) \ void NAMEANY(uint8_t* dst_ptr, const uint8_t* src_ptr, \ ptrdiff_t src_stride_ptr, int width, int source_y_fraction) { \ SIMD_ALIGNED(uint8_t temp[64 * 3]); \ @@ -1420,21 +1486,21 @@ ANY11C(UYVYToARGBRow_Any_MMI, UYVYToARGBRow_MMI, 1, 4, 4, 7) } #ifdef HAS_INTERPOLATEROW_AVX2 -ANY11T(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) +ANY11I(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_SSSE3 -ANY11T(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) +ANY11I(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_NEON -ANY11T(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) +ANY11I(InterpolateRow_Any_NEON, InterpolateRow_NEON, 1, 1, 15) #endif #ifdef HAS_INTERPOLATEROW_MSA -ANY11T(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) +ANY11I(InterpolateRow_Any_MSA, InterpolateRow_MSA, 1, 1, 31) #endif #ifdef HAS_INTERPOLATEROW_MMI -ANY11T(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) +ANY11I(InterpolateRow_Any_MMI, InterpolateRow_MMI, 1, 1, 7) #endif -#undef ANY11T +#undef ANY11I // Any 1 to 1 mirror. #define ANY11M(NAMEANY, ANY_SIMD, BPP, MASK) \ diff --git a/source/row_common.cc b/source/row_common.cc index 9a5543c4a..b80e0b3b9 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -418,6 +418,82 @@ void ARGBToAR30Row_C(const uint8_t* src_argb, uint8_t* dst_ar30, int width) { } } +void ARGBToAR64Row_C(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ar64[0] = src_argb[0] * 0x0101; + dst_ar64[1] = src_argb[1] * 0x0101; + dst_ar64[2] = src_argb[2] * 0x0101; + dst_ar64[3] = src_argb[3] * 0x0101; + dst_ar64 += 4; + src_argb += 4; + } +} + +void ARGBToAB64Row_C(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_ab64[0] = src_argb[2] * 0x0101; + dst_ab64[1] = src_argb[1] * 0x0101; + dst_ab64[2] = src_argb[0] * 0x0101; + dst_ab64[3] = src_argb[3] * 0x0101; + dst_ab64 += 4; + src_argb += 4; + } +} + +void AR64ToARGBRow_C(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ar64[0] >> 8; + dst_argb[1] = src_ar64[1] >> 8; + dst_argb[2] = src_ar64[2] >> 8; + dst_argb[3] = src_ar64[3] >> 8; + dst_argb += 4; + src_ar64 += 4; + } +} + +void AB64ToARGBRow_C(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { + int x; + for (x = 0; x < width; ++x) { + dst_argb[0] = src_ab64[2] >> 8; + dst_argb[1] = src_ab64[1] >> 8; + dst_argb[2] = src_ab64[0] >> 8; + dst_argb[3] = src_ab64[3] >> 8; + dst_argb += 4; + src_ab64 += 4; + } +} + +// TODO(fbarchard): Make shuffle compatible with SIMD versions +void AR64ShuffleRow_C(const uint8_t* src_ar64, + uint8_t* dst_ar64, + const uint8_t* shuffler, + int width) { + const uint16_t* src_ar64_16 = (const uint16_t*)src_ar64; + uint16_t* dst_ar64_16 = 
(uint16_t*)dst_ar64; + int index0 = shuffler[0] / 2; + int index1 = shuffler[2] / 2; + int index2 = shuffler[4] / 2; + int index3 = shuffler[6] / 2; + // Shuffle a row of AR64. + int x; + for (x = 0; x < width / 2; ++x) { + // To support in-place conversion. + uint16_t b = src_ar64_16[index0]; + uint16_t g = src_ar64_16[index1]; + uint16_t r = src_ar64_16[index2]; + uint16_t a = src_ar64_16[index3]; + dst_ar64_16[0] = b; + dst_ar64_16[1] = g; + dst_ar64_16[2] = r; + dst_ar64_16[3] = a; + src_ar64_16 += 4; + dst_ar64_16 += 4; + } +} + #ifdef LIBYUV_RGB7 // Old 7 bit math for compatibility on unsupported platforms. static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index f4d9978b8..49d453972 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1078,6 +1078,226 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { } #endif +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +static const uvec8 kShuffleARGBToAB64Lo = {2, 2, 1, 1, 0, 0, 3, 3, + 6, 6, 5, 5, 4, 4, 7, 7}; +static const uvec8 kShuffleARGBToAB64Hi = {10, 10, 9, 9, 8, 8, 11, 11, + 14, 14, 13, 13, 12, 12, 15, 15}; + +void ARGBToAR64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "punpckhbw %%xmm1,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void ARGBToAB64Row_SSSE3(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" + "movdqa %4,%%xmm3 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pshufb %%xmm2,%%xmm0 \n" + "pshufb %%xmm3,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x10(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +void AR64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} + +void AB64ToARGBRow_SSSE3(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "movdqa %3,%%xmm2 \n" + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "psrlw $8,%%xmm0 \n" + "psrlw $8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "pshufb %%xmm2,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x10(%1),%1 \n" + "sub $0x4,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} + +#ifdef HAS_ARGBTOAR64ROW_AVX2 +void ARGBToAR64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu 
(%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpunpckhbw %%ymm0,%%ymm0,%%ymm1 \n" + "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_ARGBTOAB64ROW_AVX2 +void ARGBToAB64Row_AVX2(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" + "vbroadcastf128 %4,%%ymm3 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm3,%%ymm0,%%ymm1 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "vmovdqu %%ymm1,0x20(%1) \n" + "lea 0x20(%0),%0 \n" + "lea 0x40(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToAB64Lo), // %3 + "m"(kShuffleARGBToAB64Hi) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + +#ifdef HAS_AR64TOARGBROW_AVX2 +void AR64ToARGBRow_AVX2(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "xmm0", "xmm1"); +} +#endif + +#ifdef HAS_AB64TOARGBROW_AVX2 +void AB64ToARGBRow_AVX2(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + + "vbroadcastf128 %3,%%ymm2 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm0 \n" + "vmovdqu 0x20(%0),%%ymm1 \n" + "vpsrlw $8,%%ymm0,%%ymm0 \n" + "vpsrlw $8,%%ymm1,%%ymm1 \n" + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" + "vpermq $0xd8,%%ymm0,%%ymm0 \n" + "vpshufb %%ymm2,%%ymm0,%%ymm0 \n" + "vmovdqu %%ymm0,(%1) \n" + "lea 0x40(%0),%0 \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2"); +} +#endif + // clang-format off // TODO(mraptis): Consider passing R, G, B multipliers as parameter. diff --git a/source/row_neon.cc b/source/row_neon.cc index 43a2cac75..5414d1ef4 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2119,6 +2119,105 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! \n" + "vmov.u8 q1, q0 \n" + "vmov.u8 q3, q2 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "vld1.8 q4, %3 \n" // shuffler + "1: \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q2}, [%0]! 
\n" + "vtbl.8 d2, {d0, d1}, d8 \n" + "vtbl.8 d3, {d0, d1}, d9 \n" + "vtbl.8 d6, {d4, d5}, d8 \n" + "vtbl.8 d7, {d4, d5}, d9 \n" + "vmov.u8 q0, q1 \n" + "vmov.u8 q2, q3 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels + "vst2.8 {q2, q3}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vshrn.u16 d0, q0, #8 \n" + "vshrn.u16 d1, q1, #8 \n" + "vshrn.u16 d4, q2, #8 \n" + "vshrn.u16 d5, q3, #8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "vld1.8 d8, %3 \n" // shuffler + "1: \n" + "vld1.16 {q0}, [%0]! \n" + "vld1.16 {q1}, [%0]! \n" + "vld1.16 {q2}, [%0]! \n" + "vld1.16 {q3}, [%0]! \n" + "vtbl.8 d0, {d0, d1}, d8 \n" + "vtbl.8 d1, {d2, d3}, d8 \n" + "vtbl.8 d4, {d4, d5}, d8 \n" + "vtbl.8 d5, {d6, d7}, d8 \n" + "subs %2, %2, #8 \n" // 8 processed per loop. + "vst1.8 {q0}, [%1]! \n" // store 4 pixels + "vst1.8 {q2}, [%1]! \n" // store 4 pixels + "bgt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); +} + void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { asm volatile( "vmov.u8 d6, #25 \n" // B * 0.1016 coefficient diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 941c9b980..9662cd3cb 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1565,6 +1565,100 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void ARGBToAB64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ab64, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q2, [%0], #32 \n" // load 8 pixels + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "mov v1.16b, v0.16b \n" + "mov v3.16b, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels + "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "m"(kShuffleARGBToABGR) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAR64ToARGB = {1, 3, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31}; + +void AR64ToARGBRow_NEON(const uint16_t* src_ar64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "prfm pldl1keep, [%0, 448] \n" + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAR64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15, + 21, 19, 17, 23, 29, 27, 25, 31}; + +void AB64ToARGBRow_NEON(const uint16_t* src_ab64, + uint8_t* dst_argb, + int width) { + asm volatile( + "ld1 {v4.16b}, %3 \n" // shuffler + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 4 pixels + "ldp q2, q3, [%0], #32 \n" // load 4 pixels + "prfm pldl1keep, [%0, 448] \n" + "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" + "subs %w2, %w2, #8 \n" // 8 processed per loop. + "stp q0, q2, [%1], #32 \n" // store 8 pixels + "b.gt 1b \n" + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "m"(kShuffleAB64ToARGB) // %3 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { asm volatile( "movi v4.8b, #25 \n" // B * 0.1016 coefficient @@ -3595,8 +3689,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v2.8h, %w2 \n" "1: \n" - "ldp q0, q1, [%0] \n" - "add %0, %0, #32 \n" + "ldp q0, q1, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" "mul v0.8h, v0.8h, v2.8h \n" "mul v1.8h, v1.8h, v2.8h \n" @@ -3619,8 +3712,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, asm volatile( "dup v0.8h, %w2 \n" "1: \n" - "ldp q1, q2, [%0] \n" - "add %0, %0, #32 \n" + "ldp q1, q2, [%0], #32 \n" "prfm pldl1keep, [%0, 448] \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 8b4f4231d..e83eea9ad 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1214,147 +1214,159 @@ TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV12, 2, 2) TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) -#define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, W1280, N, NEG, OFF) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ - const int kHeight = benchmark_height_; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i + OFF] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 1, kStrideB* kHeightB); \ - memset(dst_argb_opt, 101, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_c, kStrideB, kWidth, \ - NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_A##To##FMT_B(src_argb + OFF, kStrideA, dst_argb_opt, kStrideB, \ - kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ +#define TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B, W1280, N, NEG, OFF) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA * sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB * sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB * sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * sizeof(TYPE_A); ++i) { \ + src_argb[i + OFF] = (fastrand() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB* kHeightB); \ + memset(dst_argb_opt, 101, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B((TYPE_A*)(src_argb + OFF), kStrideA, \ + (TYPE_B*)dst_argb_opt, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kStrideB * kHeightB * sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, \ - STRIDE_B, HEIGHT_B) \ - TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ - for (int times = 0; times < benchmark_iterations_; ++times) { \ - const int kWidth = (fastrand() & 63) + 1; \ - const int kHeight = (fastrand() & 31) + 1; \ - const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ - const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ - const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - 
const int kStrideB = \ - (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeightB); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeightB); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ - src_argb[i] = (fastrand() & 0xff); \ - } \ - memset(dst_argb_c, 123, kStrideB* kHeightB); \ - memset(dst_argb_opt, 123, kStrideB* kHeightB); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_c, kStrideB, kWidth, \ - kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - FMT_A##To##FMT_B(src_argb, kStrideA, dst_argb_opt, kStrideB, kWidth, \ - kHeight); \ - for (int i = 0; i < kStrideB * kHeightB; ++i) { \ - EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ - } \ - free_aligned_buffer_page_end(src_argb); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ - } \ +#define TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, \ + TYPE_B, EPP_B, STRIDE_B, HEIGHT_B) \ + TEST_F(LibYUVConvertTest, FMT_A##To##FMT_B##_Random) { \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (fastrand() & 63) + 1; \ + const int kHeight = (fastrand() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = \ + (kWidth * EPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_page_end(src_argb, kStrideA* kHeightA * sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeightB * sizeof(TYPE_B)); \ + align_buffer_page_end(dst_argb_opt, \ + kStrideB* kHeightB * sizeof(TYPE_B)); \ + for (int i = 0; i < kStrideA * kHeightA * sizeof(TYPE_A); ++i) { \ + src_argb[i] = 0xfe; \ + } \ + memset(dst_argb_c, 123, kStrideB* kHeightB); \ + memset(dst_argb_opt, 123, kStrideB* kHeightB); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_c, \ + kStrideB, kWidth, kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + FMT_A##To##FMT_B((TYPE_A*)src_argb, kStrideA, (TYPE_B*)dst_argb_opt, \ + kStrideB, kWidth, kHeight); \ + for (int i = 0; i < kStrideB * kHeightB * sizeof(TYPE_B); ++i) { \ + EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ + } \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ } -#define TESTATOB(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Invert, -, 0) \ - TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B, benchmark_width_, _Opt, +, 0) \ - TESTATOBRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ - HEIGHT_B) +#define TESTATOB(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_ - 4, _Any, +, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, 
EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Unaligned, +, 1) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Invert, -, 0) \ + TESTATOBI(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, EPP_B, \ + STRIDE_B, HEIGHT_B, benchmark_width_, _Opt, +, 0) \ + TESTATOBRANDOM(FMT_A, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, FMT_B, TYPE_B, \ + EPP_B, STRIDE_B, HEIGHT_B) -TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1) -TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(AB30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AB30, uint8_t, 4, 4, 1) #endif -TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1) -TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) +TESTATOB(AR30, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR30, uint8_t, 4, 4, 1) #endif -TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB1555, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGB4444, 2, 2, 1) -TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, BGRA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1) -TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RAW, 3, 3, 1) -TESTATOB(ABGR, 4, 4, 1, RGB24, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB1555, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGB4444, uint8_t, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, ARGBMirror, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, BGRA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RAW, uint8_t, 3, 3, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, RGB24, uint8_t, 3, 3, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGB565, uint8_t, 2, 2, 1) #endif -TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1) -TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1) -TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1) // 4 -TESTATOB(ARGB1555, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(ARGB4444, 2, 2, 1, ARGB, 4, 4, 1) -TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(I400, 1, 1, 1, I400, 1, 1, 1) -TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1) -TESTATOB(J400, 1, 1, 1, ARGB, 4, 4, 1) -TESTATOB(J400, 1, 1, 1, J400, 1, 1, 1) -TESTATOB(RAW, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1) -TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1) -TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1) -TESTATOB(RGB24, 3, 
3, 1, J400, 1, 1, 1) -TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1) -TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, UYVY, uint8_t, 2, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, YUY2, uint8_t, 2, 4, 1) // 4 +TESTATOB(ARGB1555, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(ARGB4444, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(BGRA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400, uint8_t, 1, 1, 1) +TESTATOB(I400, uint8_t, 1, 1, 1, I400Mirror, uint8_t, 1, 1, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(J400, uint8_t, 1, 1, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGBA, uint8_t, 4, 4, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, RGB24, uint8_t, 3, 3, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) +TESTATOB(RGB24, uint8_t, 3, 3, 1, RGB24Mirror, uint8_t, 3, 3, 1) +TESTATOB(RAW, uint8_t, 3, 3, 1, J400, uint8_t, 1, 1, 1) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1) +TESTATOB(RGB565, uint8_t, 2, 2, 1, ARGB, uint8_t, 4, 4, 1) #endif -TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1) -TESTATOB(UYVY, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, ARGB, 4, 4, 1) -TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) +TESTATOB(RGBA, uint8_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(UYVY, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(YUY2, uint8_t, 2, 4, 1, Y, uint8_t, 1, 1, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ARGB, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) +TESTATOB(ABGR, uint8_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ARGB, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, ABGR, uint8_t, 4, 4, 1) +TESTATOB(AR64, uint16_t, 4, 4, 1, AB64, uint16_t, 4, 4, 1) +TESTATOB(AB64, uint16_t, 4, 4, 1, AR64, uint16_t, 4, 4, 1) #define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, W1280, N, NEG, OFF) \ @@ -1443,35 +1455,38 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1) TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) #endif -#define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, OFF) \ +#define TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, W1280, N, NEG, \ + OFF) \ TEST_F(LibYUVConvertTest, FMT_ATOB##_Symetric##N) { \ const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ const int kHeight = benchmark_height_; \ const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ const int kStrideA = \ - (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ - align_buffer_page_end(src_argb, kStrideA* kHeightA + OFF); \ - align_buffer_page_end(dst_argb_c, kStrideA* kHeightA); \ - align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + (kWidth * EPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + align_buffer_page_end(src_argb, \ + kStrideA* kHeightA * sizeof(TYPE_A) + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideA* kHeightA * sizeof(TYPE_A)); \ + align_buffer_page_end(dst_argb_opt, kStrideA* kHeightA * sizeof(TYPE_A)); \ + for (int i = 0; i < kStrideA * kHeightA * sizeof(TYPE_A); ++i) { \ src_argb[i + OFF] = (fastrand() & 0xff); \ } \ memset(dst_argb_c, 1, kStrideA* kHeightA); \ memset(dst_argb_opt, 101, kStrideA* kHeightA); \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_c, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_c, \ + kStrideA, kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_ATOB(src_argb + OFF, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ + FMT_ATOB((TYPE_A*)(src_argb + OFF), kStrideA, (TYPE_A*)dst_argb_opt, \ + kStrideA, kWidth, NEG kHeight); \ } \ MaskCpuFlags(disable_cpu_flags_); \ - FMT_ATOB(dst_argb_c, kStrideA, dst_argb_c, kStrideA, kWidth, NEG kHeight); \ + FMT_ATOB((TYPE_A*)dst_argb_c, kStrideA, (TYPE_A*)dst_argb_c, kStrideA, \ + kWidth, NEG kHeight); \ MaskCpuFlags(benchmark_cpu_info_); \ - FMT_ATOB(dst_argb_opt, kStrideA, dst_argb_opt, kStrideA, kWidth, \ - NEG kHeight); \ - for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + FMT_ATOB((TYPE_A*)dst_argb_opt, kStrideA, (TYPE_A*)dst_argb_opt, kStrideA, \ + kWidth, NEG kHeight); \ + for (int i = 0; i < kStrideA * kHeightA * sizeof(TYPE_A); ++i) { \ EXPECT_EQ(src_argb[i + OFF], dst_argb_opt[i]); \ EXPECT_EQ(dst_argb_c[i], dst_argb_opt[i]); \ } \ @@ -1480,18 +1495,20 @@ TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1) free_aligned_buffer_page_end(dst_argb_opt); \ } -#define TESTSYM(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, _Any, +, \ - 0) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Unaligned, \ - +, 1) \ - TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, _Opt, +, 0) +#define TESTSYM(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_ - 4, \ + _Any, +, 0) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Unaligned, +, 1) \ + TESTSYMI(FMT_ATOB, TYPE_A, EPP_A, STRIDE_A, HEIGHT_A, benchmark_width_, \ + _Opt, +, 0) -TESTSYM(ARGBToARGB, 4, 4, 1) -TESTSYM(ARGBToBGRA, 4, 4, 1) -TESTSYM(ARGBToABGR, 4, 4, 1) -TESTSYM(BGRAToARGB, 4, 4, 1) -TESTSYM(ABGRToARGB, 4, 4, 1) +TESTSYM(ARGBToARGB, uint8_t, 4, 4, 1) +TESTSYM(ARGBToBGRA, uint8_t, 4, 4, 1) +TESTSYM(ARGBToABGR, uint8_t, 4, 4, 1) +TESTSYM(BGRAToARGB, uint8_t, 4, 4, 1) +TESTSYM(ABGRToARGB, uint8_t, 4, 4, 1) +TESTSYM(AB64ToAR64, uint16_t, 4, 4, 1) TEST_F(LibYUVConvertTest, Test565) { SIMD_ALIGNED(uint8_t orig_pixels[256][4]); diff --git a/unit_test/video_common_test.cc b/unit_test/video_common_test.cc index 6c6a384d4..36728ea90 100644 --- a/unit_test/video_common_test.cc +++ 
b/unit_test/video_common_test.cc @@ -29,7 +29,7 @@ static bool TestValidFourCC(uint32_t fourcc, int bpp) { !TestValidChar((fourcc >> 24) & 0xff)) { return false; } - if (bpp < 0 || bpp > 32) { + if (bpp < 0 || bpp > 64) { return false; } return true; @@ -72,6 +72,8 @@ TEST_F(LibYUVBaseTest, TestFourCC) { EXPECT_TRUE(TestValidFourCC(FOURCC_ABGR, FOURCC_BPP_ABGR)); EXPECT_TRUE(TestValidFourCC(FOURCC_AR30, FOURCC_BPP_AR30)); EXPECT_TRUE(TestValidFourCC(FOURCC_AB30, FOURCC_BPP_AB30)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AR64, FOURCC_BPP_AR64)); + EXPECT_TRUE(TestValidFourCC(FOURCC_AB64, FOURCC_BPP_AB64)); EXPECT_TRUE(TestValidFourCC(FOURCC_24BG, FOURCC_BPP_24BG)); EXPECT_TRUE(TestValidFourCC(FOURCC_RAW, FOURCC_BPP_RAW)); EXPECT_TRUE(TestValidFourCC(FOURCC_RGBA, FOURCC_BPP_RGBA));
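
For reference, a minimal usage sketch of the new 16-bit entry points added by this change (ARGBToAR64, AR64ToAB64, AR64ToARGB). This is not part of the diff itself: the image size is made up for illustration, and the packed-row stride convention for the 16-bit buffers (counted in uint16_t elements, i.e. width * 4 for a tightly packed row) is inferred from the pointer arithmetic in convert_argb.cc above, so treat it as an assumption rather than documented API behavior.

#include <cstdint>
#include <vector>

#include "libyuv/convert_argb.h"       // AR64ToARGB, AR64ToAB64
#include "libyuv/convert_from_argb.h"  // ARGBToAR64

int main() {
  // Hypothetical image size for the sketch.
  const int width = 64;
  const int height = 32;

  // Tightly packed buffers: 4 channels per pixel. The ARGB stride is in
  // bytes; for the AR64/AB64 buffers the stride appears to be counted in
  // uint16_t elements, which is likewise width * 4 for a packed row.
  std::vector<uint8_t> argb(width * height * 4, 0x80);
  std::vector<uint16_t> ar64(width * height * 4);
  std::vector<uint16_t> ab64(width * height * 4);
  std::vector<uint8_t> argb_out(width * height * 4);

  // 8 bit -> 16 bit: each byte is replicated into a short (v * 0x0101),
  // so 0x80 becomes 0x8080 (see ARGBToAR64Row_C).
  libyuv::ARGBToAR64(argb.data(), width * 4, ar64.data(), width * 4, width,
                     height);

  // Channel reorder: AR64 -> AB64 swaps the R and B shorts of every pixel.
  libyuv::AR64ToAB64(ar64.data(), width * 4, ab64.data(), width * 4, width,
                     height);

  // 16 bit -> 8 bit: each short keeps its high byte (v >> 8), so the round
  // trip restores the original 0x80 (see AR64ToARGBRow_C).
  libyuv::AR64ToARGB(ar64.data(), width * 4, argb_out.data(), width * 4, width,
                     height);
  return 0;
}

Note that the reverse reorder needs no separate implementation: because swapping R and B is symmetric, AB64ToAR64 is defined in convert_argb.h as a macro alias of AR64ToAB64, just as AB64ToABGR and AR64ToABGR alias the corresponding ARGB converters.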