MergeUV AVX512BW use assembly

- Convert MergeUVRow_AVX512BW to assembly
- Enable MergeUVRow_AVX512BW for Windows with clang-cl
- MergeUVRow_AVX2 use vpmovzxbw and vpsllw
- MergeUVRow_16_AVX2 use vpmovzxwd and vpsllw/vpslld with different shifts for U and V
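
All three rewrites share one trick: zero-extend each U element into the low half of a wider lane, shift the zero-extended V into the high half, and OR the two registers, producing interleaved little-endian U,V pairs with no unpack/extract shuffles. A minimal sketch of one iteration in AVX2 intrinsics — illustrative only (helper name is ours); the change itself ships GCC inline assembly:

#include <immintrin.h>
#include <stdint.h>

// 16 U bytes and 16 V bytes in, 16 interleaved UV pairs (32 bytes) out.
static void MergeUV16_AVX2(const uint8_t* src_u, const uint8_t* src_v,
                           uint8_t* dst_uv) {
  __m256i u = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_u));
  __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i*)src_v));
  // Each 16-bit lane becomes U | V << 8, i.e. bytes U then V in memory.
  __m256i uv = _mm256_or_si256(u, _mm256_slli_epi16(v, 8));
  _mm256_storeu_si256((__m256i*)dst_uv, uv);
}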

AMD Zen 4 640x360 100000 iterations
Was
AVX512 MergeUVPlane_Opt (884 ms)
AVX2   MergeUVPlane_Opt (945 ms)
AVX2   MergeUVPlane_16_Opt (2167 ms)

Now
AVX512 MergeUVPlane_Opt (865 ms)
AVX2   MergeUVPlane_Opt (943 ms)
SSE2   MergeUVPlane_Opt (973 ms)
AVX2   MergeUVPlane_16_Opt (2102 ms)

Bug: None
Change-Id: I658ada2a75d44c3f93be8bd3ed96f83d5fa2ab8d
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4271230
Reviewed-by: Fritz Koenig <frkoenig@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Author: Frank Barchard, 2023-02-20 02:21:22 -08:00 (committed by libyuv LUCI CQ)
parent 2bdc210be9
commit 88b050f337
10 changed files with 80 additions and 106 deletions

README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1860
+Version: 1861
 License: BSD
 License File: LICENSE

include/libyuv/row.h

@@ -402,9 +402,8 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
-// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
+    (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
 #define HAS_MERGEUVROW_AVX512BW
 #endif

include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1860
+#define LIBYUV_VERSION 1861
 #endif  // INCLUDE_LIBYUV_VERSION_H_

source/convert.cc

@@ -919,7 +919,7 @@ int I422ToNV21(const uint8_t* src_y,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }

source/convert_from_argb.cc

@@ -384,7 +384,7 @@ int ARGBToNV12(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -562,7 +562,7 @@ int ARGBToNV21(const uint8_t* src_argb,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -737,7 +737,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -913,7 +913,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }
@@ -2948,7 +2948,7 @@ int RAWToJNV21(const uint8_t* src_raw,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_ = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(halfwidth, 32)) {
+    if (IS_ALIGNED(halfwidth, 16)) {
       MergeUVRow_ = MergeUVRow_AVX2;
     }
   }

source/planar_functions.cc

@@ -162,7 +162,7 @@ void Convert8To16Plane(const uint8_t* src_y,
                        int src_stride_y,
                        uint16_t* dst_y,
                        int dst_stride_y,
-                       int scale,  // 16384 for 10 bits
+                       int scale,  // 1024 for 10 bits
                        int width,
                        int height) {
   int y;
@@ -594,7 +594,7 @@ void MergeUVPlane(const uint8_t* src_u,
 #if defined(HAS_MERGEUVROW_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow = MergeUVRow_Any_AVX2;
-    if (IS_ALIGNED(width, 32)) {
+    if (IS_ALIGNED(width, 16)) {
       MergeUVRow = MergeUVRow_AVX2;
     }
   }
@@ -736,7 +736,7 @@ void MergeUVPlane_16(const uint16_t* src_u,
 #if defined(HAS_MERGEUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
     MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       MergeUVRow_16 = MergeUVRow_16_AVX2;
     }
   }
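
The corrected scale comment above follows from how the 8-to-16-bit conversion uses scale. A scalar model, assuming the byte-replication scheme of libyuv's C reference Convert8To16Row_C (scale = 1 << target depth, hence 1024 for 10 bits):

#include <stdint.h>

// Assumed model: replicate the byte (255 -> 0xFFFF), then scale down to
// the target depth. With scale = 1024, v = 255 gives
// (65535 * 1024) >> 16 = 1023, the 10-bit maximum.
static uint16_t Convert8To16(uint8_t v, int scale) {
  return (uint16_t)(((uint32_t)v * 0x0101 * scale) >> 16);
}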

source/row_any.cc

@@ -569,7 +569,7 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON,
 ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
+ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX512BW
 ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
@@ -861,7 +861,7 @@ ANY21CT(P410ToAR30Row_Any_AVX2, P410ToAR30Row_AVX2, 0, 0, uint16_t, 2, 4, 15)
 }
 #ifdef HAS_MERGEUVROW_16_AVX2
-ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 15)
+ANY21PT(MergeUVRow_16_Any_AVX2, MergeUVRow_16_AVX2, uint16_t, 2, 7)
 #endif
 #ifdef HAS_MERGEUVROW_16_NEON
 ANY21PT(MergeUVRow_16_Any_NEON, MergeUVRow_16_NEON, uint16_t, 2, 7)
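
The final macro argument is the width mask — 31 requires a multiple of 32 pixels, 15 a multiple of 16, 7 a multiple of 8 — so the smaller loop bodies above directly relax the alignment the non-Any kernels demand. A simplified model of what an ANY21-style wrapper does with that mask (the real macro in row_any.cc differs in details such as buffer alignment; this is a sketch):

#include <stdint.h>
#include <string.h>

// Run the SIMD kernel on the largest prefix it accepts, then pad the
// remainder into temporaries and run one final full block.
static void MergeUVRow_Any(void (*simd)(const uint8_t*, const uint8_t*,
                                        uint8_t*, int),
                           const uint8_t* src_u, const uint8_t* src_v,
                           uint8_t* dst_uv, int width, int mask) {
  uint8_t tmp_u[32], tmp_v[32], tmp_uv[64];  // big enough for mask <= 31
  int r = width & mask;   // leftover pixels
  int n = width & ~mask;  // SIMD-able prefix
  if (n > 0) {
    simd(src_u, src_v, dst_uv, n);
  }
  if (r > 0) {
    memset(tmp_u, 0, sizeof(tmp_u));
    memset(tmp_v, 0, sizeof(tmp_v));
    memcpy(tmp_u, src_u + n, r);
    memcpy(tmp_v, src_v + n, r);
    simd(tmp_u, tmp_v, tmp_uv, mask + 1);  // one full block on padded data
    memcpy(dst_uv + n * 2, tmp_uv, r * 2);
  }
}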

source/row_gcc.cc

@@ -17,8 +17,6 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
 
-#include <immintrin.h>
-
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 // Constants for ARGB
@@ -5145,21 +5143,30 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 #endif  // HAS_DETILESPLITUVROW_SSSE3
 
 #ifdef HAS_MERGEUVROW_AVX512BW
-__attribute__ ((target("avx512vl,avx512bw")))
 void MergeUVRow_AVX512BW(const uint8_t* src_u,
                          const uint8_t* src_v,
                          uint8_t* dst_uv,
                          int width) {
-  do {
-    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
-    const __m512i v = _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
-    const __m512i uv = _mm512_or_si512(u, v);
-    _mm512_storeu_epi8(dst_uv, uv);
-    src_u += 32;
-    src_v += 32;
-    dst_uv += 64;
-    width -= 32;
-  } while (width > 0);
+  asm volatile("sub %0,%1 \n"
+      LABELALIGN
+      "1: \n"
+      "vpmovzxbw (%0),%%zmm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%zmm1 \n"
+      "lea 0x20(%0),%0 \n"
+      "vpsllw $0x8,%%zmm1,%%zmm1 \n"
+      "vporq %%zmm0,%%zmm1,%%zmm2 \n"
+      "vmovdqu64 %%zmm2,(%2) \n"
+      "lea 0x40(%2),%2 \n"
+      "sub $0x20,%3 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3
+      :
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX512BW
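
The new zmm loop computes the same interleave the deleted intrinsics expressed, but shifts within 16-bit lanes (vpsllw) rather than 64-bit ones, and as plain inline asm it no longer needs the target attribute or immintrin.h, which lets it be enabled for Windows clang-cl (the !defined(_MSC_VER) guard dropped in row.h above). An equivalent intrinsics model for one iteration, assuming AVX512BW — not the shipped code:

#include <immintrin.h>
#include <stdint.h>

// 32 U bytes and 32 V bytes in, 32 interleaved UV pairs (64 bytes) out.
static void MergeUV32_AVX512BW(const uint8_t* src_u, const uint8_t* src_v,
                               uint8_t* dst_uv) {
  __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i*)src_u));
  __m512i v = _mm512_cvtepu8_epi16(_mm256_loadu_si256((const __m256i*)src_v));
  __m512i uv = _mm512_or_si512(u, _mm512_slli_epi16(v, 8));  // lane = U | V<<8
  _mm512_storeu_si512(dst_uv, uv);
}
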
@@ -5168,31 +5175,26 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
-      "sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
       LABELALIGN
       "1: \n"
-      "vmovdqu (%0),%%ymm0 \n"
-      "vmovdqu 0x00(%0,%1,1),%%ymm1 \n"
-      "lea 0x20(%0),%0 \n"
-      "vpunpcklbw %%ymm1,%%ymm0,%%ymm2 \n"
-      "vpunpckhbw %%ymm1,%%ymm0,%%ymm0 \n"
-      "vextractf128 $0x0,%%ymm2,(%2) \n"
-      "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-      "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-      "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-      "lea 0x40(%2),%2 \n"
-      "sub $0x20,%3 \n"
+      "vpmovzxbw (%0),%%ymm0 \n"
+      "vpmovzxbw 0x00(%0,%1,1),%%ymm1 \n"
+      "lea 0x10(%0),%0 \n"
+      "vpsllw $0x8,%%ymm1,%%ymm1 \n"
+      "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+      "vmovdqu %%ymm2,(%2) \n"
+      "lea 0x20(%2),%2 \n"
+      "sub $0x10,%3 \n"
       "jg 1b \n"
      "vzeroupper \n"
       : "+r"(src_u),   // %0
         "+r"(src_v),   // %1
         "+r"(dst_uv),  // %2
         "+r"(width)    // %3
       :
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_AVX2
@@ -5201,11 +5203,9 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width) {
-  asm volatile(
-      "sub %0,%1 \n"
+  asm volatile("sub %0,%1 \n"
       LABELALIGN
       "1: \n"
       "movdqu (%0),%%xmm0 \n"
       "movdqu 0x00(%0,%1,1),%%xmm1 \n"
@@ -5218,12 +5218,12 @@ void MergeUVRow_SSE2(const uint8_t* src_u,
       "lea 0x20(%2),%2 \n"
       "sub $0x10,%3 \n"
       "jg 1b \n"
       : "+r"(src_u),   // %0
         "+r"(src_v),   // %1
         "+r"(dst_uv),  // %2
         "+r"(width)    // %3
       :
       : "memory", "cc", "xmm0", "xmm1", "xmm2");
 }
 #endif  // HAS_MERGEUVROW_SSE2
@@ -5233,37 +5233,35 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
                         const uint16_t* src_v,
                         uint16_t* dst_uv,
                         int depth,
                         int width) {
-  depth = 16 - depth;
   // clang-format off
   asm volatile (
     "vmovd %4,%%xmm3 \n"
+    "vmovd %5,%%xmm4 \n"
     "sub %0,%1 \n"
-    // 16 pixels per loop.
+    // 8 pixels per loop.
     LABELALIGN
     "1: \n"
-    "vmovdqu (%0),%%ymm0 \n"
-    "vmovdqu (%0,%1,1),%%ymm1 \n"
-    "add $0x20,%0 \n"
+    "vpmovzxwd (%0),%%ymm0 \n"
+    "vpmovzxwd 0x00(%0,%1,1),%%ymm1 \n"
+    "lea 0x10(%0),%0 \n"
     "vpsllw %%xmm3,%%ymm0,%%ymm0 \n"
-    "vpsllw %%xmm3,%%ymm1,%%ymm1 \n"
-    "vpunpcklwd %%ymm1,%%ymm0,%%ymm2 \n"  // mutates
-    "vpunpckhwd %%ymm1,%%ymm0,%%ymm0 \n"
-    "vextractf128 $0x0,%%ymm2,(%2) \n"
-    "vextractf128 $0x0,%%ymm0,0x10(%2) \n"
-    "vextractf128 $0x1,%%ymm2,0x20(%2) \n"
-    "vextractf128 $0x1,%%ymm0,0x30(%2) \n"
-    "add $0x40,%2 \n"
-    "sub $0x10,%3 \n"
+    "vpslld %%xmm4,%%ymm1,%%ymm1 \n"
+    "vpor %%ymm0,%%ymm1,%%ymm2 \n"
+    "vmovdqu %%ymm2,(%2) \n"
+    "lea 0x20(%2),%2 \n"
+    "sub $0x8,%3 \n"
     "jg 1b \n"
     "vzeroupper \n"
     : "+r"(src_u),   // %0
       "+r"(src_v),   // %1
       "+r"(dst_uv),  // %2
       "+r"(width)    // %3
-    : "r"(depth)     // %4
-    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3");
+    : "r"(16 - depth),  // %4
+      "r"(32 - depth)   // %5
+    : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
   // clang-format on
 }
 #endif  // HAS_MERGEUVROW_AVX2
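
The %4/%5 split is the "different shifts" the commit message mentions: U is shifted by 16 - depth inside its 16-bit lane (vpsllw), V by 32 - depth across the whole 32-bit lane (vpslld), so a single vpor yields MSB-aligned U and V words. A scalar model for one pixel, illustrative only (depth up to 16):

#include <stdint.h>

// After vpmovzxwd both U and V sit in 32-bit lanes. For depth = 10:
// U << 6 stays in the low word; V << 22 lands in the high word.
static void MergeUVPixel_16(uint16_t u, uint16_t v, int depth,
                            uint16_t dst_uv[2]) {
  uint32_t merged =
      ((uint32_t)u << (16 - depth)) | ((uint32_t)v << (32 - depth));
  dst_uv[0] = (uint16_t)merged;          // U, MSB-aligned
  dst_uv[1] = (uint16_t)(merged >> 16);  // V, MSB-aligned
}
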
@@ -5469,7 +5467,6 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 // 512 = 9 bits
 // 1024 = 10 bits
 // 4096 = 12 bits
-// TODO(fbarchard): reduce to SSE2
 void Convert8To16Row_SSE2(const uint8_t* src_y,
                           uint16_t* dst_y,
                           int scale,

source/row_neon64.cc

@@ -820,28 +820,6 @@ void MergeUVRow_NEON(const uint8_t* src_u,
       : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
-
-// Reads 16 U's and V's and writes out 16 pairs of UV.
-void MergeUVRow_NEON1(const uint8_t* src_u,
-                      const uint8_t* src_v,
-                      uint8_t* dst_uv,
-                      int width) {
-  asm volatile(
-      "1: \n"
-      "ld1 {v0.16b,v2.16b}, [%0], #32 \n"  // load U
-      "ld1 {v1.16b,v3.16b}, [%1], #32 \n"  // load V
-      "subs %w3, %w3, #32 \n"  // 32 processed per loop
-      "prfm pldl1keep, [%0, 448] \n"
-      "prfm pldl1keep, [%1, 448] \n"
-      "st2 {v0.16b,v1.16b,v2.16b,v3.16b}, [%2], #64 \n"  // store 32 UV
-      "b.gt 1b \n"
-      : "+r"(src_u),   // %0
-        "+r"(src_v),   // %1
-        "+r"(dst_uv),  // %2
-        "+r"(width)    // %3  // Output registers
-      :  // Input registers
-      : "cc", "memory", "v0", "v1"  // Clobber List
-  );
-}
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
                         const uint16_t* src_v,

unit_test/planar_test.cc

@@ -3534,8 +3534,8 @@ TESTTPLANARTOP(MergeXR30, uint16_t, uint8_t, 16)
 // TODO(fbarchard): improve test for platforms and cpu detect
 #ifdef HAS_MERGEUVROW_16_AVX2
 TEST_F(LibYUVPlanarTest, MergeUVRow_16_Opt) {
-  // Round count up to multiple of 16
-  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  // Round count up to multiple of 8
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 7) & ~7;
   align_buffer_page_end(src_pixels_u, kPixels * 2);
   align_buffer_page_end(src_pixels_v, kPixels * 2);