mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
ARGBToUV444MatrixRow_AVX2 intrinsics for Visual C
Was C LibYUVConvertTest.ARGBToI444_Opt (1027 ms) Now AVX2 LibYUVConvertTest.ARGBToI444_Opt (310 ms) Bug: libyuv:508639302 Change-Id: I0bc7f5c5b72160d24226a98d5fddb184a004ed00 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7841655 Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
cb061d0378
commit
dd8b46630a
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1937
|
||||
Version: 1938
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -362,6 +362,7 @@ extern "C" {
|
||||
#endif
|
||||
#define HAS_ARGBTOYROW_AVX2
|
||||
#define HAS_ARGBTOYMATRIXROW_AVX2
|
||||
#define HAS_ARGBTOUV444MATRIXROW_AVX2
|
||||
#define HAS_ABGRTOYROW_AVX2
|
||||
#define HAS_ARGBTOYJROW_AVX2
|
||||
#define HAS_ABGRTOYJROW_AVX2
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1937
|
||||
#define LIBYUV_VERSION 1938
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -102,42 +102,109 @@ extern "C" {
|
||||
_mm_storeu_si128((__m128i*)(dst_argb + 16), xmm1); \
|
||||
dst_argb += 32;
|
||||
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I422ALPHATOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I444TOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_I444ALPHATOARGBROW_SSSE3)
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_AVX2)
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
|
||||
|
||||
#if defined(__clang__) || defined(__GNUC__)
|
||||
#define LIBYUV_TARGET_AVX2 __attribute__((target("avx2")))
|
||||
#define LIBYUV_TARGET_AVX512BW __attribute__((target("avx512bw,avx512vl,avx512f")))
|
||||
#define LIBYUV_TARGET_AVX512BW \
|
||||
__attribute__((target("avx512bw,avx512vl,avx512f")))
|
||||
#else
|
||||
#define LIBYUV_TARGET_AVX2
|
||||
#define LIBYUV_TARGET_AVX512BW
|
||||
#endif
|
||||
|
||||
// Convert 32 ARGB pixels (128 bytes) to 32 UV444 values.
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_AVX2) || defined(HAS_ARGBTOUV444MATRIXROW_AVX2)
|
||||
LIBYUV_TARGET_AVX2
|
||||
void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
__m256i ymm5 = _mm256_set1_epi8((char)0x80);
|
||||
__m256i ymm_u =
|
||||
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToU));
|
||||
__m256i ymm_v =
|
||||
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToV));
|
||||
__m256i ymm_add =
|
||||
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddUV));
|
||||
__m256i ymm_u_bias = _mm256_maddubs_epi16(ymm_u, ymm5);
|
||||
ymm_u_bias = _mm256_hadd_epi16(ymm_u_bias, ymm_u_bias);
|
||||
__m256i ymm_add_u = _mm256_sub_epi16(ymm_add, ymm_u_bias);
|
||||
__m256i ymm_v_bias = _mm256_maddubs_epi16(ymm_v, ymm5);
|
||||
ymm_v_bias = _mm256_hadd_epi16(ymm_v_bias, ymm_v_bias);
|
||||
__m256i ymm_add_v = _mm256_sub_epi16(ymm_add, ymm_v_bias);
|
||||
__m256i perm_mask = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
|
||||
|
||||
while (width > 0) {
|
||||
__m256i ymm0 = _mm256_loadu_si256((const __m256i*)src_argb);
|
||||
__m256i ymm1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
|
||||
__m256i ymm2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
|
||||
__m256i ymm3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
|
||||
src_argb += 128;
|
||||
|
||||
__m256i ymm0_u = _mm256_sub_epi8(ymm0, ymm5);
|
||||
__m256i ymm1_u = _mm256_sub_epi8(ymm1, ymm5);
|
||||
__m256i ymm2_u = _mm256_sub_epi8(ymm2, ymm5);
|
||||
__m256i ymm3_u = _mm256_sub_epi8(ymm3, ymm5);
|
||||
|
||||
__m256i ymm0_v = ymm0_u;
|
||||
__m256i ymm1_v = ymm1_u;
|
||||
__m256i ymm2_v = ymm2_u;
|
||||
__m256i ymm3_v = ymm3_u;
|
||||
|
||||
ymm0_u = _mm256_maddubs_epi16(ymm_u, ymm0_u);
|
||||
ymm1_u = _mm256_maddubs_epi16(ymm_u, ymm1_u);
|
||||
ymm2_u = _mm256_maddubs_epi16(ymm_u, ymm2_u);
|
||||
ymm3_u = _mm256_maddubs_epi16(ymm_u, ymm3_u);
|
||||
|
||||
ymm0_v = _mm256_maddubs_epi16(ymm_v, ymm0_v);
|
||||
ymm1_v = _mm256_maddubs_epi16(ymm_v, ymm1_v);
|
||||
ymm2_v = _mm256_maddubs_epi16(ymm_v, ymm2_v);
|
||||
ymm3_v = _mm256_maddubs_epi16(ymm_v, ymm3_v);
|
||||
|
||||
ymm0_u = _mm256_hadd_epi16(ymm0_u, ymm1_u);
|
||||
ymm2_u = _mm256_hadd_epi16(ymm2_u, ymm3_u);
|
||||
|
||||
ymm0_v = _mm256_hadd_epi16(ymm0_v, ymm1_v);
|
||||
ymm2_v = _mm256_hadd_epi16(ymm2_v, ymm3_v);
|
||||
|
||||
ymm0_u = _mm256_add_epi16(ymm0_u, ymm_add_u);
|
||||
ymm2_u = _mm256_add_epi16(ymm2_u, ymm_add_u);
|
||||
|
||||
ymm0_v = _mm256_add_epi16(ymm0_v, ymm_add_v);
|
||||
ymm2_v = _mm256_add_epi16(ymm2_v, ymm_add_v);
|
||||
|
||||
ymm0_u = _mm256_srli_epi16(ymm0_u, 8);
|
||||
ymm2_u = _mm256_srli_epi16(ymm2_u, 8);
|
||||
|
||||
ymm0_v = _mm256_srli_epi16(ymm0_v, 8);
|
||||
ymm2_v = _mm256_srli_epi16(ymm2_v, 8);
|
||||
|
||||
ymm0_u = _mm256_packus_epi16(ymm0_u, ymm2_u);
|
||||
ymm0_u = _mm256_permutevar8x32_epi32(ymm0_u, perm_mask);
|
||||
|
||||
ymm0_v = _mm256_packus_epi16(ymm0_v, ymm2_v);
|
||||
ymm0_v = _mm256_permutevar8x32_epi32(ymm0_v, perm_mask);
|
||||
|
||||
_mm256_storeu_si256((__m256i*)dst_u, ymm0_u);
|
||||
_mm256_storeu_si256((__m256i*)dst_v, ymm0_v);
|
||||
dst_u += 32;
|
||||
dst_v += 32;
|
||||
width -= 32;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
LIBYUV_TARGET_AVX2
|
||||
void ARGBToYMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
__m256i ymm5 = _mm256_set1_epi8((char)0x80);
|
||||
__m128i kRGBToY = _mm_loadu_si128((const __m128i*)c->kRGBToY);
|
||||
__m256i ymm4 = _mm256_broadcastsi128_si256(kRGBToY);
|
||||
__m128i kAddY = _mm_loadu_si128((const __m128i*)c->kAddY);
|
||||
__m256i ymm7 = _mm256_broadcastsi128_si256(kAddY);
|
||||
__m256i ymm4 =
|
||||
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kRGBToY));
|
||||
__m256i ymm7 =
|
||||
_mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i*)c->kAddY));
|
||||
__m256i ymm6 = _mm256_maddubs_epi16(ymm4, ymm5);
|
||||
ymm6 = _mm256_hadd_epi16(ymm6, ymm6);
|
||||
ymm7 = _mm256_sub_epi16(ymm7, ymm6);
|
||||
@ -405,7 +472,6 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user