From da41bca02b575354070666c49bfe885fdcdad528 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 20 May 2020 04:53:55 -0700 Subject: [PATCH] I400ToARGBMatrix Pass a color matrix to use different coefficients 32 bit Neon I400ToARGB_Opt (1937 ms) 64 bit C I400ToARGB_Opt (8957 ms) NEON I400ToARGB_Opt (2147 ms) x86 cI400ToARGB_Opt (1110 ms) AVX2 I400ToARGB_Opt (213 ms) SSE2 I400ToARGB_Opt (225 ms) Bug: libyuv:861, b/156642185 Change-Id: I96b6f4ebba6ff9c4ed8803291ce098de6f93fa4f Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2209718 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/convert_argb.h | 10 +++ include/libyuv/row.h | 50 +++++++++--- include/libyuv/version.h | 2 +- source/convert_argb.cc | 32 ++++++-- source/row_any.cc | 50 ++++++++++-- source/row_common.cc | 149 ++++++++++++++++++++-------------- source/row_gcc.cc | 61 +++++++------- source/row_mmi.cc | 4 +- source/row_msa.cc | 6 +- source/row_neon.cc | 13 +-- source/row_neon64.cc | 13 +-- source/row_win.cc | 3 + unit_test/convert_test.cc | 62 ++++++++++++++ 14 files changed, 320 insertions(+), 137 deletions(-) diff --git a/README.chromium b/README.chromium index d07f8e0e1..abf7e6f2e 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1755 +Version: 1756 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 0988c69ce..fd5f42dc4 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -1548,6 +1548,16 @@ int I420ToAR30Matrix(const uint8_t* src_y, int width, int height); +// Convert I400 (grey) to ARGB. Reverse of ARGBToI400. +LIBYUV_API +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert camera sample to ARGB with cropping, rotation and vertical flip. // "sample_size" is needed to parse MJPG. // "dst_stride_argb" number of bytes in a row of the dst_argb plane. diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 94ef575e2..1dc6e9b56 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -98,7 +98,6 @@ extern "C" { #define HAS_COPYROW_SSE2 #define HAS_H422TOARGBROW_SSSE3 #define HAS_HALFFLOATROW_SSE2 -#define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGBROW_SSSE3 @@ -201,7 +200,6 @@ extern "C" { #define HAS_H422TOARGBROW_AVX2 #define HAS_HALFFLOATROW_AVX2 // #define HAS_HALFFLOATROW_F16C // Enable to test halffloat cast -#define HAS_I400TOARGBROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 #define HAS_I422TOARGBROW_AVX2 @@ -275,6 +273,7 @@ extern "C" { #define HAS_HALFMERGEUVROW_SSSE3 #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_AVX2 @@ -303,6 +302,7 @@ extern "C" { #define HAS_HALFMERGEUVROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 +#define HAS_I400TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 @@ -693,6 +693,7 @@ struct YuvConstants { int16_t kUVBiasG[16]; int16_t kUVBiasR[16]; int16_t kYToRgb[16]; + int16_t kYBiasToRgb[16]; }; // Offsets into YuvConstants structure @@ -703,6 +704,8 @@ struct YuvConstants { #define KUVBIASG 128 #define KUVBIASR 160 #define KYTORGB 192 +#define KYBIASTORGB 224 + #endif #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a)-1))) @@ -2796,23 +2799,50 @@ void I422ToRGB24Row_Any_AVX2(const uint8_t* y_buf, const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width); -void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width); -void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width); -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width); -void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width); -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width); +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_MMI(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); void I400ToARGBRow_Any_SSE2(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); void I400ToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); void I400ToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, int width); -void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void I400ToARGBRow_Any_MSA(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); +void I400ToARGBRow_Any_MMI(const uint8_t* src_ptr, + uint8_t* dst_ptr, + const struct YuvConstants* yuvconstants, + int width); // ARGB preattenuated alpha blend. void ARGBBlendRow_SSSE3(const uint8_t* src_argb0, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e8d48b2ff..7af8c605a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1755 +#define LIBYUV_VERSION 1756 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index a0f23c353..14fae1dbd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -1559,16 +1559,18 @@ int I420AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } -// Convert I400 to ARGB. +// Convert I400 to ARGB with matrix. LIBYUV_API -int I400ToARGB(const uint8_t* src_y, - int src_stride_y, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height) { +int I400ToARGBMatrix(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { int y; - void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, int width) = + void (*I400ToARGBRow)(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = I400ToARGBRow_C; if (!src_y || !dst_argb || width <= 0 || height == 0) { return -1; @@ -1627,13 +1629,25 @@ int I400ToARGB(const uint8_t* src_y, #endif for (y = 0; y < height; ++y) { - I400ToARGBRow(src_y, dst_argb, width); + I400ToARGBRow(src_y, dst_argb, yuvconstants, width); dst_argb += dst_stride_argb; src_y += src_stride_y; } return 0; } +// Convert I400 to ARGB. +LIBYUV_API +int I400ToARGB(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_argb, + int dst_stride_argb, + int width, + int height) { + return I400ToARGBMatrix(src_y, src_stride_y, dst_argb, dst_stride_argb, + &kYuvI601Constants, width, height); +} + // Convert J400 to ARGB. LIBYUV_API int J400ToARGB(const uint8_t* src_y, diff --git a/source/row_any.cc b/source/row_any.cc index 207e3266c..8e3e91104 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -547,12 +547,6 @@ ANY11(J400ToARGBRow_Any_SSE2, J400ToARGBRow_SSE2, 0, 1, 4, 7) #if defined(HAS_J400TOARGBROW_AVX2) ANY11(J400ToARGBRow_Any_AVX2, J400ToARGBRow_AVX2, 0, 1, 4, 15) #endif -#if defined(HAS_I400TOARGBROW_SSE2) -ANY11(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, 0, 1, 4, 7) -#endif -#if defined(HAS_I400TOARGBROW_AVX2) -ANY11(I400ToARGBRow_Any_AVX2, I400ToARGBRow_AVX2, 0, 1, 4, 15) -#endif #if defined(HAS_RGB24TOARGBROW_SSSE3) ANY11(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 0, 3, 4, 15) ANY11(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 0, 3, 4, 15) @@ -582,7 +576,6 @@ ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_NEON, J400ToARGBRow_NEON, 0, 1, 4, 7) -ANY11(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, 0, 1, 4, 7) #endif #if defined(HAS_ARGBTORGB24ROW_MSA) ANY11(ARGBToRGB24Row_Any_MSA, ARGBToRGB24Row_MSA, 0, 4, 3, 15) @@ -591,7 +584,6 @@ ANY11(ARGBToRGB565Row_Any_MSA, ARGBToRGB565Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_MSA, ARGBToARGB1555Row_MSA, 0, 4, 2, 7) ANY11(ARGBToARGB4444Row_Any_MSA, ARGBToARGB4444Row_MSA, 0, 4, 2, 7) ANY11(J400ToARGBRow_Any_MSA, J400ToARGBRow_MSA, 0, 1, 4, 15) -ANY11(I400ToARGBRow_Any_MSA, I400ToARGBRow_MSA, 0, 1, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_MMI) ANY11(ARGBToRGB24Row_Any_MMI, ARGBToRGB24Row_MMI, 0, 4, 3, 3) @@ -600,7 +592,6 @@ ANY11(ARGBToRGB565Row_Any_MMI, ARGBToRGB565Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB1555Row_Any_MMI, ARGBToARGB1555Row_MMI, 0, 4, 2, 3) ANY11(ARGBToARGB4444Row_Any_MMI, ARGBToARGB4444Row_MMI, 0, 4, 2, 3) ANY11(J400ToARGBRow_Any_MMI, J400ToARGBRow_MMI, 0, 1, 4, 3) -ANY11(I400ToARGBRow_Any_MMI, I400ToARGBRow_MMI, 0, 1, 4, 7) #endif #if defined(HAS_RAWTORGB24ROW_NEON) ANY11(RAWToRGB24Row_Any_NEON, RAWToRGB24Row_NEON, 0, 3, 3, 7) @@ -920,6 +911,47 @@ ANY11B(ARGBCopyYToAlphaRow_Any_MMI, ARGBCopyYToAlphaRow_MMI, 0, 1, 4, 7) memcpy(dst_ptr + n * BPP, temp + 64, r * BPP); \ } +#if defined(HAS_I400TOARGBROW_SSE2) +ANY11P(I400ToARGBRow_Any_SSE2, + I400ToARGBRow_SSE2, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_AVX2) +ANY11P(I400ToARGBRow_Any_AVX2, + I400ToARGBRow_AVX2, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_NEON) +ANY11P(I400ToARGBRow_Any_NEON, + I400ToARGBRow_NEON, + const struct YuvConstants*, + 1, + 4, + 7) +#endif +#if defined(HAS_I400TOARGBROW_MSA) +ANY11P(I400ToARGBRow_Any_MSA, + I400ToARGBRow_MSA, + const struct YuvConstants*, + 1, + 4, + 15) +#endif +#if defined(HAS_I400TOARGBROW_MMI) +ANY11P(I400ToARGBRow_Any_MMI, + I400ToARGBRow_MMI, + const struct YuvConstants*, + 1, + 4, + 7) +#endif + #if defined(HAS_ARGBTORGB565DITHERROW_SSE2) ANY11P(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, diff --git a/source/row_common.cc b/source/row_common.cc index a47c03937..fea9be869 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1353,26 +1353,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) // 32 bit arm const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1384,7 +1384,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvI601Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1395,7 +1397,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuI601Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1434,26 +1438,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1465,7 +1469,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvJPEGConstants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1476,7 +1482,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuJPEGConstants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1517,26 +1525,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1548,7 +1556,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuvH709Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1559,7 +1569,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1598,26 +1610,26 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, {UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, {VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #elif defined(__arm__) const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, {UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BB, BG, BR, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BB, BG, BR, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, {VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, - {BR, BG, BB, 0, 0, 0, 0, 0}, - {0x0101 * YG, 0, 0, 0}}; + {BR, BG, BB, YGB, 0, 0, 0, 0}, + {0x0101 * YG, YG, 0, 0}}; #else const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, @@ -1629,7 +1641,9 @@ const struct YuvConstants SIMD_ALIGNED(kYuv2020Constants) = { {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, @@ -1640,7 +1654,9 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { {BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, {BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, {BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, - {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}}; + {YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, + {YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, + YGB}}; #endif #undef BB @@ -1655,7 +1671,6 @@ const struct YuvConstants SIMD_ALIGNED(kYvu2020Constants) = { // C reference code that mimics the YUV assembly. // Reads 8 bit YUV and leaves result as 16 bit. - static __inline void YuvPixel(uint8_t y, uint8_t u, uint8_t v, @@ -1671,7 +1686,7 @@ static __inline void YuvPixel(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1680,7 +1695,7 @@ static __inline void YuvPixel(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1714,7 +1729,7 @@ static __inline void YuvPixel8_16(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1723,7 +1738,7 @@ static __inline void YuvPixel8_16(uint8_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1758,7 +1773,7 @@ static __inline void YuvPixel16(int16_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #elif defined(__arm__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1767,7 +1782,7 @@ static __inline void YuvPixel16(int16_t y, int bb = yuvconstants->kUVBiasBGR[0]; int bg = yuvconstants->kUVBiasBGR[1]; int br = yuvconstants->kUVBiasBGR[2]; - int yg = yuvconstants->kYToRgb[0] / 0x0101; + int yg = yuvconstants->kYToRgb[1]; #else int ub = yuvconstants->kUVToB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1805,21 +1820,26 @@ static __inline void YuvPixel10(uint16_t y, *r = Clamp(r16 >> 6); } -// Y contribution to R,G,B. Scale and bias. -#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ -#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ - // C reference code that mimics the YUV assembly. -static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, uint8_t* r) { - uint32_t y1 = (uint32_t)(y * 0x0101 * YG) >> 16; - *b = Clamp((int32_t)(y1 + YGB) >> 6); - *g = Clamp((int32_t)(y1 + YGB) >> 6); - *r = Clamp((int32_t)(y1 + YGB) >> 6); +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YPixel(uint8_t y, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) || defined(__arm__) + int ygb = yuvconstants->kUVBiasBGR[3]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ygb = yuvconstants->kYBiasToRgb[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + uint32_t y1 = (uint32_t)(y * 0x0101 * yg) >> 16; + *b = Clamp(((int32_t)(y1) + ygb) >> 6); + *g = Clamp(((int32_t)(y1) + ygb) >> 6); + *r = Clamp(((int32_t)(y1) + ygb) >> 6); } -#undef YG -#undef YGB - #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(__aarch64__) || defined(LIBYUV_NEON)) // C mimic assembly. @@ -2353,18 +2373,21 @@ void I422ToRGBARow_C(const uint8_t* src_y, } } -void I400ToARGBRow_C(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +void I400ToARGBRow_C(const uint8_t* src_y, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { int x; for (x = 0; x < width - 1; x += 2) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6); + YPixel(src_y[1], rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2); + YPixel(src_y[0], rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 05e690743..a5de6af27 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3089,16 +3089,14 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, #endif // HAS_UYVYTOARGBROW_AVX2 #ifdef HAS_I400TOARGBROW_SSE2 -void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { +void I400ToARGBRow_SSE2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( - "mov $0x4a354a35,%%eax \n" // 4a35 = 18997 = 1.164 - "movd %%eax,%%xmm2 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "mov $0x04880488,%%eax \n" // 0488 = 1160 = 1.164 * - // 16 - "movd %%eax,%%xmm3 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pcmpeqb %%xmm4,%%xmm4 \n" + "movdqa 192(%3),%%xmm2 \n" // yg = 18997 = 1.164 + "movdqa 224(%3),%%xmm3 \n" // ygb = 1160 = 1.164 * 16 + "pcmpeqb %%xmm4,%%xmm4 \n" // 0xff000000 "pslld $0x18,%%xmm4 \n" LABELALIGN @@ -3108,8 +3106,8 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "lea 0x8(%0),%0 \n" "punpcklbw %%xmm0,%%xmm0 \n" "pmulhuw %%xmm2,%%xmm0 \n" - "psubusw %%xmm3,%%xmm0 \n" - "psrlw $6, %%xmm0 \n" + "paddsw %%xmm3,%%xmm0 \n" + "psraw $6, %%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" // Step 2: Weave into ARGB @@ -3125,27 +3123,26 @@ void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "sub $0x8,%2 \n" "jg 1b \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_SSE2 #ifdef HAS_I400TOARGBROW_AVX2 // 16 pixels of Y converted to 16 pixels of ARGB (64 bytes). // note: vpunpcklbw mutates and vpackuswb unmutates. -void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { +void I400ToARGBRow_AVX2(const uint8_t* y_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( - "mov $0x4a354a35,%%eax \n" // 0488 = 1160 = 1.164 * + "vmovdqa 192(%3),%%ymm2 \n" // yg = 18997 = 1.164 + "vmovdqa 224(%3),%%ymm3 \n" // ygb = -1160 = 1.164 * // 16 - "vmovd %%eax,%%xmm2 \n" - "vbroadcastss %%xmm2,%%ymm2 \n" - "mov $0x4880488,%%eax \n" // 4a35 = 18997 = 1.164 - "vmovd %%eax,%%xmm3 \n" - "vbroadcastss %%xmm3,%%ymm3 \n" - "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" + "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 0xff000000 "vpslld $0x18,%%ymm4,%%ymm4 \n" LABELALIGN @@ -3156,8 +3153,8 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm0 \n" "vpmulhuw %%ymm2,%%ymm0,%%ymm0 \n" - "vpsubusw %%ymm3,%%ymm0,%%ymm0 \n" - "vpsrlw $0x6,%%ymm0,%%ymm0 \n" + "vpaddsw %%ymm3,%%ymm0,%%ymm0 \n" + "vpsraw $0x6,%%ymm0,%%ymm0 \n" "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" "vpunpcklbw %%ymm0,%%ymm0,%%ymm1 \n" "vpermq $0xd8,%%ymm1,%%ymm1 \n" @@ -3167,15 +3164,15 @@ void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* dst_argb, int width) { "vpor %%ymm4,%%ymm1,%%ymm1 \n" "vmovdqu %%ymm0,(%1) \n" "vmovdqu %%ymm1,0x20(%1) \n" - "lea 0x40(%1),%1 \n" + "lea 0x40(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" "vzeroupper \n" - : "+r"(y_buf), // %0 - "+r"(dst_argb), // %1 - "+rm"(width) // %2 - : - : "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); + : "+r"(y_buf), // %0 + "+r"(dst_argb), // %1 + "+rm"(width) // %2 + : "r"(yuvconstants) // %3 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); } #endif // HAS_I400TOARGBROW_AVX2 diff --git a/source/row_mmi.cc b/source/row_mmi.cc index feb21dcc6..bf7a57e94 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -4781,7 +4781,9 @@ void J400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "memory"); } -void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { uint64_t src, src_lo, src_hi, dest, dest_lo, dest_hi; const uint64_t mask0 = 0x0; const uint64_t mask1 = 0x55; diff --git a/source/row_msa.cc b/source/row_msa.cc index 6d24d9241..37ad09512 100644 --- a/source/row_msa.cc +++ b/source/row_msa.cc @@ -2735,7 +2735,11 @@ void I444ToARGBRow_MSA(const uint8_t* src_y, } } -void I400ToARGBRow_MSA(const uint8_t* src_y, uint8_t* dst_argb, int width) { +// TODO - respect YuvConstants +void I400ToARGBRow_MSA(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants*, + int width) { int x; v16u8 src0, res0, res1, res2, res3, res4, dst0, dst1, dst2, dst3; v8i16 vec0, vec1; diff --git a/source/row_neon.cc b/source/row_neon.cc index 99b071b32..ae2d28306 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -344,7 +344,10 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, "q12", "q13", "q14", "q15"); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" @@ -355,10 +358,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB] "r"(&kYuvI601Constants.kUVToRB), - [kUVToG] "r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR] "r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb] "r"(&kYuvI601Constants.kYToRgb) + : [kUVToRB] "r"(&yuvconstants->kUVToRB), + [kUVToG] "r"(&yuvconstants->kUVToG), + [kUVBiasBGR] "r"(&yuvconstants->kUVBiasBGR), + [kYToRgb] "r"(&yuvconstants->kYToRgb) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 9f1bd50df..26a903008 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -397,7 +397,10 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y, ); } -void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { +void I400ToARGBRow_NEON(const uint8_t* src_y, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -411,10 +414,10 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) { : "+r"(src_y), // %0 "+r"(dst_argb), // %1 "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) + : [kUVToRB]"r"(&yuvconstants->kUVToRB), + [kUVToG]"r"(&yuvconstants->kUVToG), + [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), + [kYToRgb]"r"(&yuvconstants->kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); diff --git a/source/row_win.cc b/source/row_win.cc index aba6eefbd..1d2ce4002 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2900,10 +2900,12 @@ __declspec(naked) void I422ToRGBARow_SSSE3( } #endif // HAS_I422TOARGBROW_SSSE3 +// I400ToARGBRow_SSE2 is disabled due to new yuvconstant parameter #ifdef HAS_I400TOARGBROW_SSE2 // 8 pixels of Y converted to 8 pixels of ARGB (32 bytes). __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) @@ -2951,6 +2953,7 @@ __declspec(naked) void I400ToARGBRow_SSE2(const uint8_t* y_buf, // note: vpunpcklbw mutates and vpackuswb unmutates. __declspec(naked) void I400ToARGBRow_AVX2(const uint8_t* y_buf, uint8_t* rgb_buf, + const struct YuvConstants*, int width) { __asm { mov eax, 0x4a354a35 // 4a35 = 18997 = round(1.164 * 64 * 256) diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 27ea88508..1d008e57b 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -3140,4 +3140,66 @@ TEST_F(LibYUVConvertTest, TestARGBToRGB24) { free_aligned_buffer_page_end(dest_rgb24); } +// Test I400 with jpeg matrix is same as J400 +TEST_F(LibYUVConvertTest, TestI400) { + const int kSize = 256; + align_buffer_page_end(orig_i400, kSize); + align_buffer_page_end(argb_pixels_i400, kSize * 4); + align_buffer_page_end(argb_pixels_j400, kSize * 4); + align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4); + align_buffer_page_end(argb_pixels_h709_i400, kSize * 4); + align_buffer_page_end(argb_pixels_2020_i400, kSize * 4); + + // Test grey scale + for (int i = 0; i < kSize; ++i) { + orig_i400[i] = i; + } + + J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1); + I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants, + kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants, + kSize, 1); + I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants, + kSize, 1); + + EXPECT_EQ(0, argb_pixels_i400[0]); + EXPECT_EQ(0, argb_pixels_j400[0]); + EXPECT_EQ(0, argb_pixels_jpeg_i400[0]); + EXPECT_EQ(0, argb_pixels_h709_i400[0]); + EXPECT_EQ(0, argb_pixels_2020_i400[0]); + EXPECT_EQ(0, argb_pixels_i400[16 * 4]); + EXPECT_EQ(16, argb_pixels_j400[16 * 4]); + EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]); + EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]); + EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]); + EXPECT_EQ(130, argb_pixels_i400[128 * 4]); + EXPECT_EQ(128, argb_pixels_j400[128 * 4]); + EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]); + EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]); + EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]); + EXPECT_EQ(255, argb_pixels_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_j400[255 * 4]); + EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]); + EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]); + + for (int i = 0; i < kSize * 4; ++i) { + if ((i & 3) == 3) { + EXPECT_EQ(255, argb_pixels_j400[i]); + } else { + EXPECT_EQ(i / 4, argb_pixels_j400[i]); + } + EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]); + } + + free_aligned_buffer_page_end(orig_i400); + free_aligned_buffer_page_end(argb_pixels_i400); + free_aligned_buffer_page_end(argb_pixels_j400); + free_aligned_buffer_page_end(argb_pixels_jpeg_i400); + free_aligned_buffer_page_end(argb_pixels_h709_i400); + free_aligned_buffer_page_end(argb_pixels_2020_i400); +} + } // namespace libyuv