diff --git a/README.chromium b/README.chromium index 51b4a11ec..460d8ecc1 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1779 +Version: 1780 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index 297de1516..419e74309 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -54,12 +54,30 @@ LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) #define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define I010ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToARGBMatrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I010ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I010ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I210ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I210ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) +#define I410ToAB30Matrix(a, b, c, d, e, f, g, h, i, j, k) \ + I410ToAR30Matrix(a, b, e, f, c, d, g, h, i##VU, j, k) #define I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I420AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I422AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) #define I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ I444AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I010AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I210AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) +#define I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, k, l, m, n) \ + I410AlphaToARGBMatrix(a, b, e, f, c, d, g, h, i, j, k##VU, l, m, n) // Alias. #define ARGBToARGB ARGBCopy @@ -125,32 +143,6 @@ int J420ToABGR(const uint8_t* src_y, int width, int height); -// Convert F420 to ARGB. BT.709 full range -LIBYUV_API -int F420ToARGB(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_argb, - int dst_stride_argb, - int width, - int height); - -// Convert F420 to ABGR. BT.709 full range -LIBYUV_API -int F420ToABGR(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_u, - int src_stride_u, - const uint8_t* src_v, - int src_stride_v, - uint8_t* dst_abgr, - int dst_stride_abgr, - int width, - int height); - // Convert H420 to ARGB. LIBYUV_API int H420ToARGB(const uint8_t* src_y, @@ -1440,7 +1432,7 @@ int I444ToARGBMatrix(const uint8_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I010ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1454,7 +1446,7 @@ int I010ToAR30Matrix(const uint16_t* src_y, int width, int height); -// multiply 10 bit yuv into high bits to allow any number of bits. +// Convert 10 bit 420 YUV to ARGB with matrix. LIBYUV_API int I210ToAR30Matrix(const uint16_t* src_y, int src_stride_y, @@ -1468,6 +1460,20 @@ int I210ToAR30Matrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1496,6 +1502,87 @@ int I210ToARGBMatrix(const uint16_t* src_y, int width, int height); +// Convert 10 bit 444 YUV to ARGB with matrix. +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to ARGB with matrix. +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to ARGB with matrix. +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P010 to AR30 with matrix. +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// Convert P210 to AR30 with matrix. +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height); + +// P012 and P010 use most significant bits so the conversion is the same. +// Convert P012 to ARGB with matrix. +#define P012ToARGBMatrix P010ToARGBMatrix +// Convert P012 to AR30 with matrix. +#define P012ToAR30Matrix P010ToAR30Matrix +// Convert P212 to ARGB with matrix. +#define P212ToARGBMatrix P210ToARGBMatrix +// Convert P212 to AR30 with matrix. +#define P212ToAR30Matrix P210ToAR30Matrix + +// Convert P016 to ARGB with matrix. +#define P016ToARGBMatrix P010ToARGBMatrix +// Convert P016 to AR30 with matrix. +#define P016ToAR30Matrix P010ToAR30Matrix +// Convert P216 to ARGB with matrix. +#define P216ToARGBMatrix P210ToARGBMatrix +// Convert P216 to AR30 with matrix. +#define P216ToAR30Matrix P210ToAR30Matrix + // Convert I420 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1547,6 +1634,57 @@ int I444AlphaToARGBMatrix(const uint8_t* src_y, int height, int attenuate); +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate); + // Convert NV12 to ARGB with matrix. LIBYUV_API int NV12ToARGBMatrix(const uint8_t* src_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 68fb88b3e..765363140 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -175,8 +175,10 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_SSSE3 +#define HAS_I210ALPHATOARGBROW_SSSE3 +#define HAS_I410ALPHATOARGBROW_SSSE3 #define HAS_I422ALPHATOARGBROW_SSSE3 +#define HAS_I444ALPHATOARGBROW_SSSE3 #endif #endif @@ -240,8 +242,10 @@ extern "C" { defined(_MSC_VER) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 -#define HAS_I444ALPHATOARGBROW_AVX2 +#define HAS_I210ALPHATOARGBROW_AVX2 +#define HAS_I410ALPHATOARGBROW_AVX2 #define HAS_I422ALPHATOARGBROW_AVX2 +#define HAS_I444ALPHATOARGBROW_AVX2 #endif #endif @@ -280,9 +284,15 @@ extern "C" { #define HAS_I210TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 #define HAS_I422TOAR30ROW_SSSE3 +#define HAS_I410TOAR30ROW_SSSE3 +#define HAS_I410TOARGBROW_SSSE3 #define HAS_MERGEARGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 +#define HAS_P210TOAR30ROW_SSSE3 +#define HAS_P210TOARGBROW_SSSE3 +#define HAS_P410TOAR30ROW_SSSE3 +#define HAS_P410TOARGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 #define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 @@ -311,6 +321,12 @@ extern "C" { #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I400TOARGBROW_AVX2 +#define HAS_I410TOAR30ROW_AVX2 +#define HAS_I410TOARGBROW_AVX2 +#define HAS_P210TOAR30ROW_AVX2 +#define HAS_P210TOARGBROW_AVX2 +#define HAS_P410TOAR30ROW_AVX2 +#define HAS_P410TOARGBROW_AVX2 #define HAS_I422TOAR30ROW_AVX2 #define HAS_I422TOUYVYROW_AVX2 #define HAS_I422TOYUY2ROW_AVX2 @@ -2575,6 +2591,33 @@ void I210ToARGBRow_C(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); + void I444AlphaToARGBRow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2626,6 +2669,27 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_C(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_C(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_C(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_C(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_C(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -2705,6 +2769,32 @@ void I210ToARGBRow_SSSE3(const uint16_t* y_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I410ToAR30Row_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2723,6 +2813,32 @@ void I210ToAR30Row_AVX2(const uint16_t* y_buf, uint8_t* dst_ar30, const struct YuvConstants* yuvconstants, int width); +void I410ToAR30Row_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2821,6 +2937,48 @@ void UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, uint8_t* dst_argb, const struct YuvConstants* yuvconstants, int width); + +void P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); + void I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2923,6 +3081,32 @@ void I210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I410ToAR30Row_Any_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_SSSE3(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I422ToAR30Row_Any_AVX2(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2941,6 +3125,32 @@ void I210ToAR30Row_Any_AVX2(const uint16_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void I410ToAR30Row_Any_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410ToARGBRow_Any_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I210AlphaToARGBRow_Any_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); +void I410AlphaToARGBRow_Any_AVX2(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width); void I444AlphaToARGBRow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3039,6 +3249,46 @@ void UYVYToARGBRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I422ToRGBARow_Any_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -3537,6 +3787,46 @@ void UYVYToARGBRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void P210ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P210ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P410ToARGBRow_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width); +void P210ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); +void P410ToAR30Row_Any_NEON(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I444ToARGBRow_Any_MSA(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index e59b316a6..7016100d5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1779 +#define LIBYUV_VERSION 1780 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 87d7d7325..eb185b6e2 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -1045,6 +1045,58 @@ int U210ToAB30(const uint16_t* src_y, &kYuv2020Constants, width, height); } +LIBYUV_API +int I410ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToAR30Row)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToAR30Row_C; + if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_I410TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToAR30Row = I410ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToAR30Row = I410ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_I410TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToAR30Row = I410ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToAR30Row = I410ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert 10 bit YUV to ARGB with matrix. LIBYUV_API int I010ToARGBMatrix(const uint16_t* src_y, @@ -1087,14 +1139,6 @@ int I010ToARGBMatrix(const uint16_t* src_y, I210ToARGBRow = I210ToARGBRow_AVX2; } } -#endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } #endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -1258,14 +1302,6 @@ int I210ToARGBMatrix(const uint16_t* src_y, I210ToARGBRow = I210ToARGBRow_AVX2; } } -#endif -#if defined(HAS_I210TOARGBROW_MMI) - if (TestCpuFlag(kCpuHasMMI)) { - I210ToARGBRow = I210ToARGBRow_Any_MMI; - if (IS_ALIGNED(width, 4)) { - I210ToARGBRow = I210ToARGBRow_MMI; - } - } #endif for (y = 0; y < height; ++y) { I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); @@ -1385,6 +1421,254 @@ int U210ToABGR(const uint16_t* src_y, width, height); } +LIBYUV_API +int I410ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*I410ToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + I410ToARGBRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410ToARGBRow = I410ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410ToARGBRow = I410ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410ToARGBRow = I410ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410ToARGBRow = I410ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + I410ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +LIBYUV_API +int P010ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToARGBRow)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + P210ToARGBRow_C; + if (!src_y || !src_uv || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_P210TOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToARGBRow = P210ToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToARGBRow = P210ToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_P210TOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToARGBRow = P210ToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToARGBRow = P210ToARGBRow_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToARGBRow(src_y, src_uv, dst_argb, yuvconstants, width); + dst_argb += dst_stride_argb; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + +LIBYUV_API +int P010ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_uv += src_stride_uv; + } + } + return 0; +} + +LIBYUV_API +int P210ToAR30Matrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_uv, + int src_stride_uv, + uint8_t* dst_ar30, + int dst_stride_ar30, + const struct YuvConstants* yuvconstants, + int width, + int height) { + int y; + void (*P210ToAR30Row)(const uint16_t* y_buf, const uint16_t* uv_buf, uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, int width) = + P210ToAR30Row_C; + if (!src_y || !src_uv || !dst_ar30 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; + dst_stride_ar30 = -dst_stride_ar30; + } +#if defined(HAS_P210TOAR30ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + P210ToAR30Row = P210ToAR30Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + P210ToAR30Row = P210ToAR30Row_SSSE3; + } + } +#endif +#if defined(HAS_P210TOAR30ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + P210ToAR30Row = P210ToAR30Row_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + P210ToAR30Row = P210ToAR30Row_AVX2; + } + } +#endif + for (y = 0; y < height; ++y) { + P210ToAR30Row(src_y, src_uv, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + src_uv += src_stride_uv; + } + return 0; +} + // Convert I420 with Alpha to preattenuated ARGB with matrix. LIBYUV_API int I420AlphaToARGBMatrix(const uint8_t* src_y, @@ -1903,6 +2187,323 @@ int I444AlphaToABGR(const uint8_t* src_y, width, height, attenuate); } +// Convert I010 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I010AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; + } + } + return 0; +} + +// Convert I210 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I210AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I210AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I210AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I210ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I210ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I210AlphaToARGBRow = I210AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I210AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + +// Convert I410 with Alpha to preattenuated ARGB with matrix. +LIBYUV_API +int I410AlphaToARGBMatrix(const uint16_t* src_y, + int src_stride_y, + const uint16_t* src_u, + int src_stride_u, + const uint16_t* src_v, + int src_stride_v, + const uint16_t* src_a, + int src_stride_a, + uint8_t* dst_argb, + int dst_stride_argb, + const struct YuvConstants* yuvconstants, + int width, + int height, + int attenuate) { + int y; + void (*I410AlphaToARGBRow)(const uint16_t* y_buf, const uint16_t* u_buf, + const uint16_t* v_buf, const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) = I410AlphaToARGBRow_C; + void (*ARGBAttenuateRow)(const uint8_t* src_argb, uint8_t* dst_argb, + int width) = ARGBAttenuateRow_C; + if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + dst_argb = dst_argb + (height - 1) * dst_stride_argb; + dst_stride_argb = -dst_stride_argb; + } +#if defined(HAS_I410ALPHATOARGBROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_SSSE3; + } + } +#endif +#if defined(HAS_I410ALPHATOARGBROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_Any_AVX2; + if (IS_ALIGNED(width, 16)) { + I410AlphaToARGBRow = I410AlphaToARGBRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3; + if (IS_ALIGNED(width, 4)) { + ARGBAttenuateRow = ARGBAttenuateRow_SSSE3; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_AVX2; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_NEON; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MMI) + if (TestCpuFlag(kCpuHasMMI)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MMI; + if (IS_ALIGNED(width, 2)) { + ARGBAttenuateRow = ARGBAttenuateRow_MMI; + } + } +#endif +#if defined(HAS_ARGBATTENUATEROW_MSA) + if (TestCpuFlag(kCpuHasMSA)) { + ARGBAttenuateRow = ARGBAttenuateRow_Any_MSA; + if (IS_ALIGNED(width, 8)) { + ARGBAttenuateRow = ARGBAttenuateRow_MSA; + } + } +#endif + + for (y = 0; y < height; ++y) { + I410AlphaToARGBRow(src_y, src_u, src_v, src_a, dst_argb, yuvconstants, + width); + if (attenuate) { + ARGBAttenuateRow(dst_argb, dst_argb, width); + } + dst_argb += dst_stride_argb; + src_a += src_stride_a; + src_y += src_stride_y; + src_u += src_stride_u; + src_v += src_stride_v; + } + return 0; +} + // Convert I400 to ARGB with matrix. LIBYUV_API int I400ToARGBMatrix(const uint8_t* src_y, diff --git a/source/row_any.cc b/source/row_any.cc index 08ae1d2af..bcb59ea7a 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -115,6 +115,46 @@ ANY41C(I422AlphaToARGBRow_Any_MMI, I422AlphaToARGBRow_MMI, 1, 0, 4, 7) #endif #undef ANY41C +// Any 4 planes to 1 plane of 8 bit with yuvconstants +#define ANY41CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, const T* a_buf, \ + uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, \ + int width) { \ + SIMD_ALIGNED(T temp[16 * 4]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 4 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, u_buf, v_buf, a_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \ + memcpy(temp + 48, a_buf + n, r * SBPP); \ + ANY_SIMD(temp, temp + 16, temp + 32, temp + 48, out, yuvconstants, \ + MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +ANY41CT(I210AlphaToARGBRow_Any_SSSE3, I210AlphaToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif + +#ifdef HAS_I210ALPHATOARGBROW_AVX2 +ANY41CT(I210AlphaToARGBRow_Any_AVX2, I210AlphaToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +ANY41CT(I410AlphaToARGBRow_Any_SSSE3, I410AlphaToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif + +#ifdef HAS_I410ALPHATOARGBROW_AVX2 +ANY41CT(I410AlphaToARGBRow_Any_AVX2, I410AlphaToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif + +#undef ANY41CT + // Any 3 planes to 1. #define ANY31(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, BPP, MASK) \ void NAMEANY(const uint8_t* y_buf, const uint8_t* u_buf, \ @@ -327,6 +367,18 @@ ANY31CT(I210ToARGBRow_Any_AVX2, I210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) #ifdef HAS_I210TOAR30ROW_AVX2 ANY31CT(I210ToAR30Row_Any_AVX2, I210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) #endif +#ifdef HAS_I410TOAR30ROW_SSSE3 +ANY31CT(I410ToAR30Row_Any_SSSE3, I410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_SSSE3 +ANY31CT(I410ToARGBRow_Any_SSSE3, I410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_I410TOARGBROW_AVX2 +ANY31CT(I410ToARGBRow_Any_AVX2, I410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_I410TOAR30ROW_AVX2 +ANY31CT(I410ToAR30Row_Any_AVX2, I410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif #ifdef HAS_I210TOARGBROW_MMI ANY31CT(I210ToARGBRow_Any_MMI, I210ToARGBRow_MMI, 1, 0, uint16_t, 2, 4, 7) #endif @@ -546,12 +598,57 @@ ANY21C(NV12ToRGB565Row_Any_MMI, NV12ToRGB565Row_MMI, 1, 1, 2, 2, 7) #endif #undef ANY21C +// Any 2 planes of 16 bit to 1 with yuvconstants +#define ANY21CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \ + void NAMEANY(const T* y_buf, const T* uv_buf, uint8_t* dst_ptr, \ + const struct YuvConstants* yuvconstants, int width) { \ + SIMD_ALIGNED(T temp[16 * 3]); \ + SIMD_ALIGNED(uint8_t out[64]); \ + memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, yuvconstants, n); \ + } \ + memcpy(temp, y_buf + n, r * SBPP); \ + memcpy(temp + 16, uv_buf + 2 * (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP * 2); \ + ANY_SIMD(temp, temp + 16, out, yuvconstants, MASK + 1); \ + memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ + } + +#ifdef HAS_P210TOAR30ROW_SSSE3 +ANY21CT(P210ToAR30Row_Any_SSSE3, P210ToAR30Row_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_SSSE3 +ANY21CT(P210ToARGBRow_Any_SSSE3, P210ToARGBRow_SSSE3, 1, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P210TOARGBROW_AVX2 +ANY21CT(P210ToARGBRow_Any_AVX2, P210ToARGBRow_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P210TOAR30ROW_AVX2 +ANY21CT(P210ToAR30Row_Any_AVX2, P210ToAR30Row_AVX2, 1, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_SSSE3 +ANY21CT(P410ToAR30Row_Any_SSSE3, P410ToAR30Row_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_SSSE3 +ANY21CT(P410ToARGBRow_Any_SSSE3, P410ToARGBRow_SSSE3, 0, 0, uint16_t, 2, 4, 7) +#endif +#ifdef HAS_P410TOARGBROW_AVX2 +ANY21CT(P410ToARGBRow_Any_AVX2, P410ToARGBRow_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif +#ifdef HAS_P410TOAR30ROW_AVX2 +ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15) +#endif + +#undef ANY21CT + // Any 2 16 bit planes with parameter to 1 #define ANY21PT(NAMEANY, ANY_SIMD, T, BPP, MASK) \ void NAMEANY(const T* src_u, const T* src_v, T* dst_uv, int depth, \ int width) { \ SIMD_ALIGNED(T temp[16 * 4]); \ - memset(temp, 0, 16 * 4); /* for msan */ \ + memset(temp, 0, 16 * 4 * BPP); /* for msan */ \ int r = width & MASK; \ int n = width & ~MASK; \ if (n > 0) { \ diff --git a/source/row_common.cc b/source/row_common.cc index a941c3f5f..ad4e95ea6 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1546,14 +1546,14 @@ MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR) #undef MAKEYUVCONSTANTS // C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel(uint8_t y, - uint8_t u, - uint8_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { +// Reads 8 bit YUV and leaves result as 8 bit. +static __inline void YuvPixel8_8(uint8_t y, + uint8_t u, + uint8_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1634,13 +1634,13 @@ static __inline void YuvPixel8_16(uint8_t y, // C reference code that mimics the YUV 16 bit assembly. // Reads 10 bit YUV and leaves result as 16 bit. -static __inline void YuvPixel16(int16_t y, - int16_t u, - int16_t v, - int* b, - int* g, - int* r, - const struct YuvConstants* yuvconstants) { +static __inline void YuvPixel10_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1680,24 +1680,116 @@ static __inline void YuvPixel16(int16_t y, // C reference code that mimics the YUV 10 bit assembly. // Reads 10 bit YUV and clamps down to 8 bit RGB. -static __inline void YuvPixel10(uint16_t y, - uint16_t u, - uint16_t v, - uint8_t* b, - uint8_t* g, - uint8_t* r, - const struct YuvConstants* yuvconstants) { +static __inline void YuvPixel10_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { int b16; int g16; int r16; - YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + YuvPixel10_16(y, u, v, &b16, &g16, &r16, yuvconstants); *b = Clamp(b16 >> 6); *g = Clamp(g16 >> 6); *r = Clamp(r16 >> 6); } +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 8 bit. +static __inline void YuvPixel16_8(uint16_t y, + uint16_t u, + uint16_t v, + uint8_t* b, + uint8_t* g, + uint8_t* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * yg) >> 16; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + *b = Clamp((int32_t)(y1 + -(u * ub) + bb) >> 6); + *g = Clamp((int32_t)(y1 + -(u * ug + v * vg) + bg) >> 6); + *r = Clamp((int32_t)(y1 + -(v * vr) + br) >> 6); +} + +// C reference code that mimics the YUV 16 bit assembly. +// Reads 16 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel16_16(uint16_t y, + uint16_t u, + uint16_t v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[1]; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32_t y1 = (uint32_t)(y * yg) >> 16; + u = clamp255(u >> 8); + v = clamp255(v >> 8); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + // C reference code that mimics the YUV assembly. -// Reads 8 bit YUV and leaves result as 16 bit. +// Reads 8 bit YUV and leaves result as 8 bit. static __inline void YPixel(uint8_t y, uint8_t* b, uint8_t* g, @@ -1730,11 +1822,11 @@ void I444ToARGBRow_C(const uint8_t* src_y, for (x = 0; x < width - 1; x += 2) { uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); + YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); + YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 2; @@ -1742,8 +1834,8 @@ void I444ToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1756,8 +1848,8 @@ void I444ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; src_y += 1; src_u += 1; @@ -1776,11 +1868,11 @@ void I422ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1788,8 +1880,8 @@ void I422ToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -1803,11 +1895,11 @@ void I210ToARGBRow_C(const uint16_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel10(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_u += 1; @@ -1815,15 +1907,81 @@ void I210ToARGBRow_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel10(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } +void I410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + +void I210AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + YuvPixel10_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = clamp255(src_a[1] >> 2); + src_y += 2; + src_u += 1; + src_v += 1; + src_a += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + } +} + +void I410AlphaToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + const uint16_t* src_a, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel10_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = clamp255(src_a[0] >> 2); + src_y += 1; + src_u += 1; + src_v += 1; + src_a += 1; + rgb_buf += 4; // Advance 1 pixels. + } +} + static void StoreAR30(uint8_t* rgb_buf, int b, int g, int r) { uint32_t ar30; - b = b >> 4; // convert 10.6 to 10 bit. + b = b >> 4; // convert 8 bit 10.6 to 10 bit. g = g >> 4; r = r >> 4; b = Clamp10(b); @@ -1845,9 +2003,9 @@ void I210ToAR30Row_C(const uint16_t* src_y, int g; int r; for (x = 0; x < width - 1; x += 2) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); - YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf + 4, b, g, r); src_y += 2; src_u += 1; @@ -1855,11 +2013,113 @@ void I210ToAR30Row_C(const uint16_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); StoreAR30(rgb_buf, b, g, r); } } +void I410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_u, + const uint16_t* src_v, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel10_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_u += 1; + src_v += 1; + rgb_buf += 4; // Advance 1 pixel. + } +} + +void P210ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + YuvPixel16_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); + rgb_buf[7] = 255; + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + } +} + +void P410ToARGBRow_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + for (x = 0; x < width; ++x) { + YuvPixel16_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + rgb_buf[3] = 255; + src_y += 1; + src_uv += 2; + rgb_buf += 4; // Advance 1 pixels. + } +} + +void P210ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16_16(src_y[1], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_uv += 2; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + +void P410ToAR30Row_C(const uint16_t* src_y, + const uint16_t* src_uv, + uint8_t* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width; ++x) { + YuvPixel16_16(src_y[0], src_uv[0], src_uv[1], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + src_y += 1; + src_uv += 2; + rgb_buf += 4; // Advance 1 pixel. + } +} + // 8 bit YUV to 10 bit AR30 // Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits. void I422ToAR30Row_C(const uint8_t* src_y, @@ -1903,11 +2163,11 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, for (x = 0; x < width - 1; x += 2) { uint8_t u = (src_u[0] + src_u[1] + 1) >> 1; uint8_t v = (src_v[0] + src_v[1] + 1) >> 1; - YuvPixel(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, - yuvconstants); + YuvPixel8_8(src_y[0], u, v, rgb_buf + 0, rgb_buf + 1, rgb_buf + 2, + yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, - yuvconstants); + YuvPixel8_8(src_y[1], u, v, rgb_buf + 4, rgb_buf + 5, rgb_buf + 6, + yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 2; @@ -1916,8 +2176,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } @@ -1931,8 +2191,8 @@ void I444AlphaToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width; ++x) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; src_y += 1; src_u += 1; @@ -1952,11 +2212,11 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = src_a[1]; src_y += 2; src_u += 1; @@ -1965,8 +2225,8 @@ void I422AlphaToARGBRow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = src_a[0]; } } @@ -1979,18 +2239,18 @@ void I422ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_u += 1; src_v += 1; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2008,8 +2268,8 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -2024,7 +2284,7 @@ void I422ToARGB4444Row_C(const uint8_t* src_y, dst_argb4444 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 4; g0 = g0 >> 4; r0 = r0 >> 4; @@ -2046,8 +2306,8 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -2062,7 +2322,7 @@ void I422ToARGB1555Row_C(const uint8_t* src_y, dst_argb1555 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 3; r0 = r0 >> 3; @@ -2084,8 +2344,8 @@ void I422ToRGB565Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2100,7 +2360,7 @@ void I422ToRGB565Row_C(const uint8_t* src_y, dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2115,19 +2375,19 @@ void NV12ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_uv += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2139,19 +2399,19 @@ void NV21ToARGBRow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_y += 2; src_vu += 2; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2163,17 +2423,17 @@ void NV12ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_uv += 2; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2184,17 +2444,17 @@ void NV21ToRGB24Row_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); - YuvPixel(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, - rgb_buf + 5, yuvconstants); + YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[1], src_vu[1], src_vu[0], rgb_buf + 3, rgb_buf + 4, + rgb_buf + 5, yuvconstants); src_y += 2; src_vu += 2; rgb_buf += 6; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_y[0], src_vu[1], src_vu[0], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); } } @@ -2211,8 +2471,8 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, uint8_t r1; int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); - YuvPixel(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[1], src_uv[0], src_uv[1], &b1, &g1, &r1, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2226,7 +2486,7 @@ void NV12ToRGB565Row_C(const uint8_t* src_y, dst_rgb565 += 4; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); + YuvPixel8_8(src_y[0], src_uv[0], src_uv[1], &b0, &g0, &r0, yuvconstants); b0 = b0 >> 3; g0 = g0 >> 2; r0 = r0 >> 3; @@ -2240,18 +2500,18 @@ void YUY2ToARGBRow_C(const uint8_t* src_yuy2, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_yuy2[2], src_yuy2[1], src_yuy2[3], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_yuy2 += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_yuy2[0], src_yuy2[1], src_yuy2[3], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2262,18 +2522,18 @@ void UYVYToARGBRow_C(const uint8_t* src_uyvy, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; - YuvPixel(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, - rgb_buf + 6, yuvconstants); + YuvPixel8_8(src_uyvy[3], src_uyvy[0], src_uyvy[2], rgb_buf + 4, rgb_buf + 5, + rgb_buf + 6, yuvconstants); rgb_buf[7] = 255; src_uyvy += 4; rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, - rgb_buf + 2, yuvconstants); + YuvPixel8_8(src_uyvy[1], src_uyvy[0], src_uyvy[2], rgb_buf + 0, rgb_buf + 1, + rgb_buf + 2, yuvconstants); rgb_buf[3] = 255; } } @@ -2286,11 +2546,11 @@ void I422ToRGBARow_C(const uint8_t* src_y, int width) { int x; for (x = 0; x < width - 1; x += 2) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; - YuvPixel(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, - rgb_buf + 7, yuvconstants); + YuvPixel8_8(src_y[1], src_u[0], src_v[0], rgb_buf + 5, rgb_buf + 6, + rgb_buf + 7, yuvconstants); rgb_buf[4] = 255; src_y += 2; src_u += 1; @@ -2298,8 +2558,8 @@ void I422ToRGBARow_C(const uint8_t* src_y, rgb_buf += 8; // Advance 2 pixels. } if (width & 1) { - YuvPixel(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, - rgb_buf + 3, yuvconstants); + YuvPixel8_8(src_y[0], src_u[0], src_v[0], rgb_buf + 1, rgb_buf + 2, + rgb_buf + 3, yuvconstants); rgb_buf[0] = 255; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index faf0fc910..2c823a132 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1944,13 +1944,63 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ "lea 0x8(%[u_buf]),%[u_buf] \n" \ "punpcklwd %%xmm1,%%xmm0 \n" \ - "psraw $0x2,%%xmm0 \n" \ + "psraw $2,%%xmm0 \n" \ "packuswb %%xmm0,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ "movdqu (%[y_buf]),%%xmm4 \n" \ - "psllw $0x6,%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" +#define READYUVA210 \ + "movq (%[u_buf]),%%xmm0 \n" \ + "movq 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x8(%[u_buf]),%[u_buf] \n" \ + "punpcklwd %%xmm1,%%xmm0 \n" \ + "psraw $2,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + +// Read 8 UV from 444 10 bit +#define READYUV410 \ + "movdqu (%[u_buf]),%%xmm0 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm0 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from 444 10 bit. With 8 Alpha. +#define READYUVA410 \ + "movdqu (%[u_buf]),%%xmm0 \n" \ + "movdqu 0x00(%[u_buf],%[v_buf],1),%%xmm2 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "psraw $2,%%xmm0 \n" \ + "psraw $2,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm1 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "psllw $0x6,%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" \ + "movdqu (%[a_buf]),%%xmm5 \n" \ + "psraw $2,%%xmm5 \n" \ + "packuswb %%xmm5,%%xmm5 \n" \ + "lea 0x10(%[a_buf]),%[a_buf] \n" + // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. #define READYUVA422 \ "movd (%[u_buf]),%%xmm0 \n" \ @@ -2010,6 +2060,27 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba0, "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ "lea 0x10(%[uyvy_buf]),%[uyvy_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210 \ + "movdqu (%[uv_buf]),%%xmm0 \n" \ + "lea 0x10(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm0 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "punpcklwd %%xmm0,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410 \ + "movdqu (%[uv_buf]),%%xmm0 \n" \ + "movdqu 0x10(%[uv_buf]),%%xmm1 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "psrlw $0x8,%%xmm0 \n" \ + "psrlw $0x8,%%xmm1 \n" \ + "packuswb %%xmm1,%%xmm0 \n" \ + "movdqu (%[y_buf]),%%xmm4 \n" \ + "lea 0x10(%[y_buf]),%[y_buf] \n" + #if defined(__x86_64__) #define YUVTORGB_SETUP(yuvconstants) \ "movdqa (%[yuvconstants]),%%xmm8 \n" \ @@ -2362,6 +2433,146 @@ void OMITFP I210ToAR30Row_SSSE3(const uint16_t* y_buf, ); } +// 10 bit YUV to ARGB +void OMITFP I410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +#ifdef HAS_I210ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I210AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA210 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif + +#ifdef HAS_I410ALPHATOARGBROW_SSSE3 +// 10 bit YUVA to ARGB +void OMITFP I410AlphaToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410 + YUVTORGB(yuvconstants) + STOREARGB + "subl $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif + +// 10 bit YUV to AR30 +void OMITFP I410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, @@ -2513,6 +2724,123 @@ void OMITFP UYVYToARGBRow_SSSE3(const uint8_t* uyvy_buf, // clang-format on } +void OMITFP P210ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READP210 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP P410ToARGBRow_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + + LABELALIGN + "1: \n" + READP410 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf] "+r"(y_buf), // %[y_buf] + [uv_buf] "+r"(uv_buf), // %[u_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] + [width] "+rm"(width) // %[width] + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", + "xmm5"); +} + +void OMITFP P210ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + +void OMITFP P410ToAR30Row_SSSE3(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP(yuvconstants) + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" + "pcmpeqb %%xmm7,%%xmm7 \n" // 0 for min + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READP410 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, const uint8_t* u_buf, const uint8_t* v_buf, @@ -2570,7 +2898,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" -// Read 8 UV from 210 10 bit, upsample to 16 UV +// Read 8 UV from 210, upsample to 16 UV // TODO(fbarchard): Consider vshufb to replace pack/unpack // TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1. #define READYUV210_AVX2 \ @@ -2580,13 +2908,64 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ - "vpsraw $0x2,%%ymm0,%%ymm0 \n" \ + "vpsraw $2,%%ymm0,%%ymm0 \n" \ "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ "vmovdqu (%[y_buf]),%%ymm4 \n" \ - "vpsllw $0x6,%%ymm4,%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ "lea 0x20(%[y_buf]),%[y_buf] \n" +// Read 8 UV from 210, upsample to 16 UV. With 16 Alpha. +#define READYUVA210_AVX2 \ + "vmovdqu (%[u_buf]),%%xmm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%xmm1 \n" \ + "lea 0x10(%[u_buf]),%[u_buf] \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm1,%%ymm1 \n" \ + "vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpsraw $2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + +// Read 16 UV from 410 +#define READYUV410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm0,%%ymm0 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 16 UV from 410. With 16 Alpha. +#define READYUVA410_AVX2 \ + "vmovdqu (%[u_buf]),%%ymm0 \n" \ + "vmovdqu 0x00(%[u_buf],%[v_buf],1),%%ymm2 \n" \ + "lea 0x20(%[u_buf]),%[u_buf] \n" \ + "vpsraw $2,%%ymm0,%%ymm0 \n" \ + "vpsraw $2,%%ymm2,%%ymm2 \n" \ + "vpunpckhwd %%ymm2,%%ymm0,%%ymm1 \n" \ + "vpunpcklwd %%ymm2,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "vpsllw $6,%%ymm4,%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" \ + "vmovdqu (%[a_buf]),%%ymm5 \n" \ + "vpsraw $2,%%ymm5,%%ymm5 \n" \ + "vpackuswb %%ymm5,%%ymm5,%%ymm5 \n" \ + "lea 0x20(%[a_buf]),%[a_buf] \n" + // Read 16 UV from 444. With 16 Alpha. #define READYUVA444_AVX2 \ "vmovdqu (%[u_buf]),%%xmm0 \n" \ @@ -2641,6 +3020,28 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf, "vpunpcklbw %%ymm4,%%ymm4,%%ymm4 \n" \ "lea 0x10(%[y_buf]),%[y_buf] \n" +// Read 4 UV from P210, upsample to 8 UV +#define READP210_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm0 \n" \ + "lea 0x20(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n" \ + "vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + +// Read 8 UV from P410 +#define READP410_AVX2 \ + "vmovdqu (%[uv_buf]),%%ymm0 \n" \ + "vmovdqu 0x20(%[uv_buf]),%%ymm1 \n" \ + "lea 0x40(%[uv_buf]),%[uv_buf] \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpsrlw $0x8,%%ymm1,%%ymm1 \n" \ + "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n" \ + "vpermq $0xd8,%%ymm0,%%ymm0 \n" \ + "vmovdqu (%[y_buf]),%%ymm4 \n" \ + "lea 0x20(%[y_buf]),%[y_buf] \n" + // Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV. #define READYUY2_AVX2 \ "vmovdqu (%[yuy2_buf]),%%ymm4 \n" \ @@ -2934,6 +3335,165 @@ void OMITFP I210ToAR30Row_AVX2(const uint16_t* y_buf, } #endif // HAS_I210TOAR30ROW_AVX2 +#if defined(HAS_I410TOARGBROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 ARGB (64 bytes). +void OMITFP I410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I210ALPHATOARGBROW_AVX2) +// 16 pixels +// 8 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). +void OMITFP I210AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5" + ); +} +#endif // HAS_I210TOARGBROW_AVX2 + +#if defined(HAS_I410ALPHATOARGBROW_AVX2) +// 16 pixels +// 16 UV, 16 Y and 16 A producing 16 ARGB (64 bytes). +void OMITFP I410AlphaToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + const uint16_t* a_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + + LABELALIGN + "1: \n" + READYUVA410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "subl $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + + : [y_buf] "+r"(y_buf), // %[y_buf] + [u_buf] "+r"(u_buf), // %[u_buf] + [v_buf] "+r"(v_buf), // %[v_buf] + [a_buf] "+r"(a_buf), // %[a_buf] + [dst_argb] "+r"(dst_argb), // %[dst_argb] +#if defined(__i386__) + [width]"+m"(width) // %[width] +#else + [width]"+rm"(width) // %[width] +#endif + : [yuvconstants] "r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOARGBROW_AVX2 + +#if defined(HAS_I410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). +void OMITFP I410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* u_buf, + const uint16_t* v_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READYUV410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} +#endif // HAS_I410TOAR30ROW_AVX2 + #if defined(HAS_I444ALPHATOARGBROW_AVX2) // 16 pixels // 16 UV values with 16 Y and 16 A producing 16 ARGB. @@ -3193,6 +3753,148 @@ void OMITFP UYVYToARGBRow_AVX2(const uint8_t* uyvy_buf, } #endif // HAS_UYVYTOARGBROW_AVX2 +#if defined(HAS_P210TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP P210ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P210TOARGBROW_AVX2 + +#if defined(HAS_P410TOARGBROW_AVX2) +// 16 pixels. +// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). +void OMITFP P410ToARGBRow_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_argb, + const struct YuvConstants* yuvconstants, + int width) { + // clang-format off + asm volatile ( + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB_AVX2(yuvconstants) + STOREARGB_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); + // clang-format on +} +#endif // HAS_P410TOARGBROW_AVX2 + +#if defined(HAS_P210TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). +void OMITFP P210ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP210_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P210TOAR30ROW_AVX2 + +#if defined(HAS_P410TOAR30ROW_AVX2) +// 16 pixels +// 16 UV values with 16 Y producing 16 AR30 (64 bytes). +void OMITFP P410ToAR30Row_AVX2(const uint16_t* y_buf, + const uint16_t* uv_buf, + uint8_t* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + + YUVTORGB_SETUP_AVX2(yuvconstants) + "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // AR30 constants + "vpsrlw $14,%%ymm5,%%ymm5 \n" + "vpsllw $4,%%ymm5,%%ymm5 \n" // 2 alpha bits + "vpxor %%ymm6,%%ymm6,%%ymm6 \n" // 0 for min + "vpcmpeqb %%ymm7,%%ymm7,%%ymm7 \n" // 1023 for max + "vpsrlw $6,%%ymm7,%%ymm7 \n" + + LABELALIGN + "1: \n" + READP410_AVX2 + YUVTORGB16_AVX2(yuvconstants) + STOREAR30_AVX2 + "sub $0x10,%[width] \n" + "jg 1b \n" + + "vzeroupper \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [uv_buf]"+r"(uv_buf), // %[uv_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS_AVX2 + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} +#endif // HAS_P410TOAR30ROW_AVX2 + #ifdef HAS_I400TOARGBROW_SSE2 void I400ToARGBRow_SSE2(const uint8_t* y_buf, uint8_t* dst_argb, diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 8638a84c1..de5cd00eb 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -55,13 +55,13 @@ namespace libyuv { static_assert(SRC_BPC == 1 || SRC_BPC == 2, "SRC BPC unsupported"); \ static_assert(DST_BPC == 1 || DST_BPC == 2, "DST BPC unsupported"); \ static_assert(SRC_SUBSAMP_X == 1 || SRC_SUBSAMP_X == 2, \ - "SRC_SUBSAMP_X unsupported"); \ + "SRC_SUBSAMP_X unsupported"); \ static_assert(SRC_SUBSAMP_Y == 1 || SRC_SUBSAMP_Y == 2, \ - "SRC_SUBSAMP_Y unsupported"); \ + "SRC_SUBSAMP_Y unsupported"); \ static_assert(DST_SUBSAMP_X == 1 || DST_SUBSAMP_X == 2, \ - "DST_SUBSAMP_X unsupported"); \ + "DST_SUBSAMP_X unsupported"); \ static_assert(DST_SUBSAMP_Y == 1 || DST_SUBSAMP_Y == 2, \ - "DST_SUBSAMP_Y unsupported"); \ + "DST_SUBSAMP_Y unsupported"); \ const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ const int kHeight = benchmark_height_; \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ @@ -928,6 +928,8 @@ TESTQPLANARTOB(J420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(J420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ARGB, 4, 4, 1) +TESTQPLANARTOB(F420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1) TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1) TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1) @@ -938,6 +940,8 @@ TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1) @@ -948,6 +952,8 @@ TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(J444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ARGB, 4, 4, 1) +TESTQPLANARTOB(F444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1) TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1) TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1) @@ -3162,89 +3168,441 @@ TEST_F(LibYUVConvertTest, ABGRToAR30Row_Opt) { #endif // HAS_ABGRTOAR30ROW_AVX2 // TODO(fbarchard): Fix clamping issue affected by U channel. -#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ - ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF) \ - TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ - const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ - const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ - const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ - const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ - const int kBpc = 2; \ - align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ - align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ - align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ - align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ - for (int i = 0; i < kWidth * kHeight; ++i) { \ - reinterpret_cast(src_y + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - for (int i = 0; i < kSizeUV; ++i) { \ - reinterpret_cast(src_u + SOFF)[i] = (fastrand() & 0x3ff); \ - reinterpret_cast(src_v + SOFF)[i] = (fastrand() & 0x3ff); \ - } \ - memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ - memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ - MaskCpuFlags(disable_cpu_flags_); \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast(src_y + SOFF), kWidth, \ - reinterpret_cast(src_u + SOFF), kStrideUV, \ - reinterpret_cast(src_v + SOFF), kStrideUV, \ - dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ - MaskCpuFlags(benchmark_cpu_info_); \ - for (int i = 0; i < benchmark_iterations_; ++i) { \ - FMT_PLANAR##To##FMT_B( \ - reinterpret_cast(src_y + SOFF), kWidth, \ - reinterpret_cast(src_u + SOFF), kStrideUV, \ - reinterpret_cast(src_v + SOFF), kStrideUV, \ - dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ - } \ - for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ - EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ - } \ - free_aligned_buffer_page_end(src_y); \ - free_aligned_buffer_page_end(src_u); \ - free_aligned_buffer_page_end(src_v); \ - free_aligned_buffer_page_end(dst_argb_c); \ - free_aligned_buffer_page_end(dst_argb_opt); \ +#define TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast(src_y + SOFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast(src_u + SOFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast(src_v + SOFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_u + SOFF), kStrideUV, \ + reinterpret_cast(src_v + SOFF), kStrideUV, \ + dst_argb_c + DOFF, kStrideB, kWidth, NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_u + SOFF), kStrideUV, \ + reinterpret_cast(src_v + SOFF), kStrideUV, \ + dst_argb_opt + DOFF, kStrideB, kWidth, NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ } #define TESTPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN) \ + YALIGN, S_DEPTH) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_ - 4, _Any, +, 0, 0) \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Unaligned, +, 1, 1) \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Invert, -, 0, 0) \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ TESTPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ - YALIGN, benchmark_width_, _Opt, +, 0, 0) + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) -TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1) +#define I410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToARGB(a, b, c, d, e, f, g, h, i, j) \ + I410ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define U410ToABGR(a, b, c, d, e, f, g, h, i, j) \ + I410ToABGRMatrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define I410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define I410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvI601Constants, i, j) +#define H410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define H410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuvH709Constants, i, j) +#define U410ToAR30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAR30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) +#define U410ToAB30(a, b, c, d, e, f, g, h, i, j) \ + I410ToAB30Matrix(a, b, c, d, e, f, g, h, &kYuv2020Constants, i, j) + +TESTPLANAR16TOB(I010, 2, 2, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(I010, 2, 2, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(H010, 2, 2, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(H010, 2, 2, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(U010, 2, 2, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(U010, 2, 2, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(I210, 2, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(I210, 2, 1, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(I410, 1, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(I410, 1, 1, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(H410, 1, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(H410, 1, 1, ABGR, 4, 4, 1, 10) +TESTPLANAR16TOB(U410, 1, 1, ARGB, 4, 4, 1, 10) +TESTPLANAR16TOB(U410, 1, 1, ABGR, 4, 4, 1, 10) #ifdef LITTLE_ENDIAN_ONLY_TEST -TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1) -TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1) -TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1) +TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(H010, 2, 2, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(U010, 2, 2, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(U010, 2, 2, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(I210, 2, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(I210, 2, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(H210, 2, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(H210, 2, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(U210, 2, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(U210, 2, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(I410, 1, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(I410, 1, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(H410, 1, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(H410, 1, 1, AB30, 4, 4, 1, 10) +TESTPLANAR16TOB(U410, 1, 1, AR30, 4, 4, 1, 10) +TESTPLANAR16TOB(U410, 1, 1, AB30, 4, 4, 1, 10) +#endif + +#define TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, OFF, ATTEN, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X); \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y); \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(src_u, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_v, kSizeUV* kBpc + OFF); \ + align_buffer_page_end(src_a, kWidth* kHeight* kBpc + OFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + OFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + OFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast(src_y + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast(src_a + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast(src_u + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + reinterpret_cast(src_v + OFF)[i] = \ + (fastrand() & ((1 << S_DEPTH) - 1)); \ + } \ + memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + OFF), kWidth, \ + reinterpret_cast(src_u + OFF), kStrideUV, \ + reinterpret_cast(src_v + OFF), kStrideUV, \ + reinterpret_cast(src_a + OFF), kWidth, \ + dst_argb_c + OFF, kStrideB, kWidth, NEG kHeight, \ + ATTEN); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B( \ + reinterpret_cast(src_y + OFF), kWidth, \ + reinterpret_cast(src_u + OFF), kStrideUV, \ + reinterpret_cast(src_v + OFF), kStrideUV, \ + reinterpret_cast(src_a + OFF), kWidth, \ + dst_argb_opt + OFF, kStrideB, kWidth, NEG kHeight, ATTEN); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + OFF], dst_argb_opt[i + OFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_u); \ + free_aligned_buffer_page_end(src_v); \ + free_aligned_buffer_page_end(src_a); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTQPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) \ + TESTQPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Premult, +, 0, 1, S_DEPTH) + +#define I010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V010AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V010AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I010AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V210AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V210AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I210AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define I410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define I410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvI601Constants, k, \ + l, m) +#define J410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define J410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \ + l, m) +#define F410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define F410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvF709Constants, k, \ + l, m) +#define H410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define H410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvH709Constants, k, \ + l, m) +#define U410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define U410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \ + l, m) +#define V410AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) +#define V410AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \ + I410AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \ + l, m) + +TESTQPLANAR16TOB(I010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V010Alpha, 2, 2, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V210Alpha, 2, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(I410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(J410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(H410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(F410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(U410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(U410Alpha, 1, 1, ABGR, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ARGB, 4, 4, 1, 10) +TESTQPLANAR16TOB(V410Alpha, 1, 1, ABGR, 4, 4, 1, 10) + +#define TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, W1280, N, NEG, SOFF, DOFF, S_DEPTH) \ + TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ + const int kWidth = ((W1280) > 0) ? (W1280) : 1; \ + const int kHeight = ALIGNINT(benchmark_height_, YALIGN); \ + const int kStrideB = ALIGNINT(kWidth * BPP_B, ALIGN); \ + const int kStrideUV = SUBSAMPLE(kWidth, SUBSAMP_X) * 2; \ + const int kSizeUV = kStrideUV * SUBSAMPLE(kHeight, SUBSAMP_Y) * 2; \ + const int kBpc = 2; \ + align_buffer_page_end(src_y, kWidth* kHeight* kBpc + SOFF); \ + align_buffer_page_end(src_uv, kSizeUV* kBpc + SOFF); \ + align_buffer_page_end(dst_argb_c, kStrideB* kHeight + DOFF); \ + align_buffer_page_end(dst_argb_opt, kStrideB* kHeight + DOFF); \ + for (int i = 0; i < kWidth * kHeight; ++i) { \ + reinterpret_cast(src_y + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + for (int i = 0; i < kSizeUV; ++i) { \ + reinterpret_cast(src_uv + SOFF)[i] = \ + (fastrand() & (((uint16_t)(-1)) << (16 - S_DEPTH))); \ + } \ + memset(dst_argb_c + DOFF, 1, kStrideB * kHeight); \ + memset(dst_argb_opt + DOFF, 101, kStrideB * kHeight); \ + MaskCpuFlags(disable_cpu_flags_); \ + FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_uv + SOFF), \ + kStrideUV, dst_argb_c + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + MaskCpuFlags(benchmark_cpu_info_); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_PLANAR##To##FMT_B(reinterpret_cast(src_y + SOFF), kWidth, \ + reinterpret_cast(src_uv + SOFF), \ + kStrideUV, dst_argb_opt + DOFF, kStrideB, kWidth, \ + NEG kHeight); \ + } \ + for (int i = 0; i < kWidth * BPP_B * kHeight; ++i) { \ + EXPECT_EQ(dst_argb_c[i + DOFF], dst_argb_opt[i + DOFF]); \ + } \ + free_aligned_buffer_page_end(src_y); \ + free_aligned_buffer_page_end(src_uv); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } + +#define TESTBIPLANAR16TOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ + ALIGN, YALIGN, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_ - 4, _Any, +, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Unaligned, +, 1, 1, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Invert, -, 0, 0, S_DEPTH) \ + TESTBIPLANAR16TOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ + YALIGN, benchmark_width_, _Opt, +, 0, 0, S_DEPTH) + +#define P010ToARGB(a, b, c, d, e, f, g, h) \ + P010ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToARGB(a, b, c, d, e, f, g, h) \ + P210ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P010ToAR30(a, b, c, d, e, f, g, h) \ + P010ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P210ToAR30(a, b, c, d, e, f, g, h) \ + P210ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P012ToARGB(a, b, c, d, e, f, g, h) \ + P012ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToARGB(a, b, c, d, e, f, g, h) \ + P212ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P012ToAR30(a, b, c, d, e, f, g, h) \ + P012ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P212ToAR30(a, b, c, d, e, f, g, h) \ + P212ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +#define P016ToARGB(a, b, c, d, e, f, g, h) \ + P016ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToARGB(a, b, c, d, e, f, g, h) \ + P216ToARGBMatrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P016ToAR30(a, b, c, d, e, f, g, h) \ + P016ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) +#define P216ToAR30(a, b, c, d, e, f, g, h) \ + P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h) + +TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, ARGB, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, ARGB, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, ARGB, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, ARGB, 4, 4, 1, 16) +#ifdef LITTLE_ENDIAN_ONLY_TEST +TESTBIPLANAR16TOB(P010, 2, 2, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P210, 2, 1, AR30, 4, 4, 1, 10) +TESTBIPLANAR16TOB(P012, 2, 2, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P212, 2, 1, AR30, 4, 4, 1, 12) +TESTBIPLANAR16TOB(P016, 2, 2, AR30, 4, 4, 1, 16) +TESTBIPLANAR16TOB(P216, 2, 1, AR30, 4, 4, 1, 16) #endif static int Clamp(int y) {