ARGBToUV444MatrixRow_AVX2 intrinsics for Visual C

Was C
LibYUVConvertTest.ARGBToI444_Opt (1027 ms)

Now AVX2
LibYUVConvertTest.ARGBToI444_Opt (310 ms)

Bug: libyuv:508639302
Change-Id: I0bc7f5c5b72160d24226a98d5fddb184a004ed00
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7841655
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2026-05-12 13:57:46 -07:00
parent cb061d0378
commit dd8b46630a
4 changed files with 92 additions and 25 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1937
Version: 1938
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -362,6 +362,7 @@ extern "C" {
#endif
#define HAS_ARGBTOYROW_AVX2
#define HAS_ARGBTOYMATRIXROW_AVX2
#define HAS_ARGBTOUV444MATRIXROW_AVX2
#define HAS_ABGRTOYROW_AVX2
#define HAS_ARGBTOYJROW_AVX2
#define HAS_ABGRTOYJROW_AVX2

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1937
#define LIBYUV_VERSION 1938
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -102,42 +102,109 @@ extern "C" {
_mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
dst_argb += 32;
#if defined(HAS_I422TOARGBROW_SSSE3)
#endif
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
#endif
#if defined(HAS_I444TOARGBROW_SSSE3)
#endif
#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
#if defined(__clang__) || defined(__GNUC__)
#define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
#define LIBYUV_TARGET_AVX512BW \
__attribute__((target("avx512bw,avx512vl,avx512f")))
#else
#define LIBYUV_TARGET_AVX2
#define LIBYUV_TARGET_AVX512BW
#endif
// Convert 32 ARGB pixels (128 bytes) to 32 UV444 values.
#if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
LIBYUV_TARGET_AVX2
void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
__m256i ymm5 = _mm256_set1_epi8((char)0x80);
__m256i ymm_u =
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
__m256i ymm_v =
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
__m256i ymm_add =
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddUV));
__m256i ymm_u_bias = _mm256_maddubs_epi16(ymm_u, ymm5);
ymm_u_bias = _mm256_hadd_epi16(ymm_u_bias, ymm_u_bias);
__m256i ymm_add_u = _mm256_sub_epi16(ymm_add, ymm_u_bias);
__m256i ymm_v_bias = _mm256_maddubs_epi16(ymm_v, ymm5);
ymm_v_bias = _mm256_hadd_epi16(ymm_v_bias, ymm_v_bias);
__m256i ymm_add_v = _mm256_sub_epi16(ymm_add, ymm_v_bias);
__m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
while (width > 0) {
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
src_argb += 128;
__m256i ymm0_u = _mm256_sub_epi8(ymm0, ymm5);
__m256i ymm1_u = _mm256_sub_epi8(ymm1, ymm5);
__m256i ymm2_u = _mm256_sub_epi8(ymm2, ymm5);
__m256i ymm3_u = _mm256_sub_epi8(ymm3, ymm5);
__m256i ymm0_v = ymm0_u;
__m256i ymm1_v = ymm1_u;
__m256i ymm2_v = ymm2_u;
__m256i ymm3_v = ymm3_u;
ymm0_u = _mm256_maddubs_epi16(ymm_u, ymm0_u);
ymm1_u = _mm256_maddubs_epi16(ymm_u, ymm1_u);
ymm2_u = _mm256_maddubs_epi16(ymm_u, ymm2_u);
ymm3_u = _mm256_maddubs_epi16(ymm_u, ymm3_u);
ymm0_v = _mm256_maddubs_epi16(ymm_v, ymm0_v);
ymm1_v = _mm256_maddubs_epi16(ymm_v, ymm1_v);
ymm2_v = _mm256_maddubs_epi16(ymm_v, ymm2_v);
ymm3_v = _mm256_maddubs_epi16(ymm_v, ymm3_v);
ymm0_u = _mm256_hadd_epi16(ymm0_u, ymm1_u);
ymm2_u = _mm256_hadd_epi16(ymm2_u, ymm3_u);
ymm0_v = _mm256_hadd_epi16(ymm0_v, ymm1_v);
ymm2_v = _mm256_hadd_epi16(ymm2_v, ymm3_v);
ymm0_u = _mm256_add_epi16(ymm0_u, ymm_add_u);
ymm2_u = _mm256_add_epi16(ymm2_u, ymm_add_u);
ymm0_v = _mm256_add_epi16(ymm0_v, ymm_add_v);
ymm2_v = _mm256_add_epi16(ymm2_v, ymm_add_v);
ymm0_u = _mm256_srli_epi16(ymm0_u, 8);
ymm2_u = _mm256_srli_epi16(ymm2_u, 8);
ymm0_v = _mm256_srli_epi16(ymm0_v, 8);
ymm2_v = _mm256_srli_epi16(ymm2_v, 8);
ymm0_u = _mm256_packus_epi16(ymm0_u, ymm2_u);
ymm0_u = _mm256_permutevar8x32_epi32(ymm0_u, perm_mask);
ymm0_v = _mm256_packus_epi16(ymm0_v, ymm2_v);
ymm0_v = _mm256_permutevar8x32_epi32(ymm0_v, perm_mask);
_mm256_storeu_si256((__m256i*)dst_u, ymm0_u);
_mm256_storeu_si256((__m256i*)dst_v, ymm0_v);
dst_u += 32;
dst_v += 32;
width -= 32;
}
}
#endif
LIBYUV_TARGET_AVX2
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
__m256i ymm5 = _mm256_set1_epi8((char)0x80);
__m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY);
__m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY);
__m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY);
__m256i ymm7 = _mm256_broadcastsi128_si256(kAddY);
__m256i ymm4 =
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToY));
__m256i ymm7 =
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddY));
__m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5);
ymm6 = _mm256_hadd_epi16(ymm6, ymm6);
ymm7 = _mm256_sub_epi16(ymm7, ymm6);
@ -405,7 +472,6 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
#endif
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv