From dd8b46630a1f3510aefea626b0a2600328d97070 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 12 May 2026 13:57:46 -0700 Subject: [PATCH] ARGBToUV444MatrixRow_AVX2 intrinsics for Visual C Was C LibYUVConvertTest.ARGBToI444_Opt (1027 ms) Now AVX2 LibYUVConvertTest.ARGBToI444_Opt (310 ms) Bug: libyuv:508639302 Change-Id: I0bc7f5c5b72160d24226a98d5fddb184a004ed00 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7841655 Reviewed-by: richard winterton --- README.chromium | 2 +- include/libyuv/row.h | 1 + include/libyuv/version.h | 2 +- source/row_win.cc | 112 +++++++++++++++++++++++++++++++-------- 4 files changed, 92 insertions(+), 25 deletions(-) diff --git a/README.chromium b/README.chromium index 92d44bc8c..3e36b6704 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1937 +Version: 1938 Revision: DEPS License: BSD-3-Clause License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5a135133e..2bd046913 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -362,6 +362,7 @@ extern "C" { #endif #define HAS_ARGBTOYROW_AVX2 #define HAS_ARGBTOYMATRIXROW_AVX2 +#define HAS_ARGBTOUV444MATRIXROW_AVX2 #define HAS_ABGRTOYROW_AVX2 #define HAS_ARGBTOYJROW_AVX2 #define HAS_ABGRTOYJROW_AVX2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f384c1efb..1a7808bc2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1937 +#define LIBYUV_VERSION 1938 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_win.cc b/source/row_win.cc index 77070d031..87a4a5aeb 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -102,42 +102,109 @@ extern "C" { _mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \ dst_argb += 32; -#if defined(HAS_I422TOARGBROW_SSSE3) - -#endif - -#if defined(HAS_I422ALPHATOARGBROW_SSSE3) - -#endif - -#if defined(HAS_I444TOARGBROW_SSSE3) - -#endif - -#if defined(HAS_I444ALPHATOARGBROW_SSSE3) - -#endif - -#if defined(HAS_ARGBTOYROW_AVX2) +#if defined(HAS_ARGBTOYMATRIXROW_AVX2) #if defined(__clang__) || defined(__GNUC__) #define LIBYUV_TARGET_AVX2 __attribute__((target("avx2"))) -#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f"))) +#define LIBYUV_TARGET_AVX512BW \ + __attribute__((target("avx512bw,avx512vl,avx512f"))) #else #define LIBYUV_TARGET_AVX2 #define LIBYUV_TARGET_AVX512BW #endif +// Convert 32 ARGB pixels (128 bytes) to 32 UV444 values. +#if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2) +LIBYUV_TARGET_AVX2 +void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { + __m256i ymm5 = _mm256_set1_epi8((char)0x80); + __m256i ymm_u = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU)); + __m256i ymm_v = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV)); + __m256i ymm_add = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddUV)); + __m256i ymm_u_bias = _mm256_maddubs_epi16(ymm_u, ymm5); + ymm_u_bias = _mm256_hadd_epi16(ymm_u_bias, ymm_u_bias); + __m256i ymm_add_u = _mm256_sub_epi16(ymm_add, ymm_u_bias); + __m256i ymm_v_bias = _mm256_maddubs_epi16(ymm_v, ymm5); + ymm_v_bias = _mm256_hadd_epi16(ymm_v_bias, ymm_v_bias); + __m256i ymm_add_v = _mm256_sub_epi16(ymm_add, ymm_v_bias); + __m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7); + + while (width > 0) { + __m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb); + __m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32)); + __m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64)); + __m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96)); + src_argb += 128; + + __m256i ymm0_u = _mm256_sub_epi8(ymm0, ymm5); + __m256i ymm1_u = _mm256_sub_epi8(ymm1, ymm5); + __m256i ymm2_u = _mm256_sub_epi8(ymm2, ymm5); + __m256i ymm3_u = _mm256_sub_epi8(ymm3, ymm5); + + __m256i ymm0_v = ymm0_u; + __m256i ymm1_v = ymm1_u; + __m256i ymm2_v = ymm2_u; + __m256i ymm3_v = ymm3_u; + + ymm0_u = _mm256_maddubs_epi16(ymm_u, ymm0_u); + ymm1_u = _mm256_maddubs_epi16(ymm_u, ymm1_u); + ymm2_u = _mm256_maddubs_epi16(ymm_u, ymm2_u); + ymm3_u = _mm256_maddubs_epi16(ymm_u, ymm3_u); + + ymm0_v = _mm256_maddubs_epi16(ymm_v, ymm0_v); + ymm1_v = _mm256_maddubs_epi16(ymm_v, ymm1_v); + ymm2_v = _mm256_maddubs_epi16(ymm_v, ymm2_v); + ymm3_v = _mm256_maddubs_epi16(ymm_v, ymm3_v); + + ymm0_u = _mm256_hadd_epi16(ymm0_u, ymm1_u); + ymm2_u = _mm256_hadd_epi16(ymm2_u, ymm3_u); + + ymm0_v = _mm256_hadd_epi16(ymm0_v, ymm1_v); + ymm2_v = _mm256_hadd_epi16(ymm2_v, ymm3_v); + + ymm0_u = _mm256_add_epi16(ymm0_u, ymm_add_u); + ymm2_u = _mm256_add_epi16(ymm2_u, ymm_add_u); + + ymm0_v = _mm256_add_epi16(ymm0_v, ymm_add_v); + ymm2_v = _mm256_add_epi16(ymm2_v, ymm_add_v); + + ymm0_u = _mm256_srli_epi16(ymm0_u, 8); + ymm2_u = _mm256_srli_epi16(ymm2_u, 8); + + ymm0_v = _mm256_srli_epi16(ymm0_v, 8); + ymm2_v = _mm256_srli_epi16(ymm2_v, 8); + + ymm0_u = _mm256_packus_epi16(ymm0_u, ymm2_u); + ymm0_u = _mm256_permutevar8x32_epi32(ymm0_u, perm_mask); + + ymm0_v = _mm256_packus_epi16(ymm0_v, ymm2_v); + ymm0_v = _mm256_permutevar8x32_epi32(ymm0_v, perm_mask); + + _mm256_storeu_si256((__m256i*)dst_u, ymm0_u); + _mm256_storeu_si256((__m256i*)dst_v, ymm0_v); + dst_u += 32; + dst_v += 32; + width -= 32; + } +} +#endif LIBYUV_TARGET_AVX2 void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width, const struct ArgbConstants* c) { __m256i ymm5 = _mm256_set1_epi8((char)0x80); - __m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY); - __m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY); - __m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY); - __m256i ymm7 = _mm256_broadcastsi128_si256(kAddY); + __m256i ymm4 = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToY)); + __m256i ymm7 = + _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddY)); __m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5); ymm6 = _mm256_hadd_epi16(ymm6, ymm6); ymm7 = _mm256_sub_epi16(ymm7, ymm6); @@ -405,7 +472,6 @@ void MergeUVRow_AVX2(const uint8_t* src_u, #endif - #ifdef __cplusplus } // extern "C" } // namespace libyuv