InterpolateRow_16_AVX2 for row_gcc

On AMD Zen4
Was C
TestInterpolatePlane_16 (143 ms)
Now AVX2
TestInterpolatePlane_16 (48 ms)

Was
I210ToI420_Opt (87 ms)
 35.60% InterpolateRow_16To8_AVX2
 31.03% Convert16To8Row_AVX512BW
 21.35% Convert16To8Row_AVX2

Now
I210ToI420_Opt (69 ms)
 37.57% Convert16To8Row_AVX512BW
 32.69% InterpolateRow_16_AVX2
  7.18% Convert16To8Row_AVX2
  5.23% InterpolateRow_16To8_AVX2

Bug: None
Change-Id: Ica9b9c5dbd847068ae076b682c487e1753d3c812
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7855648
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2026-05-18 14:13:04 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent cda55fcf53
commit 9f751100d2
6 changed files with 102 additions and 23 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1939
Version: 1940
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -323,6 +323,7 @@ extern "C" {
#define HAS_I422TOUYVYROW_AVX2
#define HAS_I422TOYUY2ROW_AVX2
#define HAS_INTERPOLATEROW_16TO8_AVX2
#define HAS_INTERPOLATEROW_16_AVX2
#define HAS_MERGEAR64ROW_AVX2
#define HAS_MERGEARGB16TO8ROW_AVX2
#define HAS_MERGEARGBROW_AVX2
@ -6700,6 +6701,16 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
// AVX2 vertical blend of two rows of 16-bit samples into one row.
// Reads the row at src_ptr and the row src_stride uint16_t elements after it,
// and writes the weighted result to dst_ptr. source_y_fraction is the weight
// of the second row out of 256; 0 copies the first row and 128 averages both.
// The kernel consumes 16 samples per loop iteration, so width is presumably
// expected to be a positive multiple of 16 - confirm against callers.
void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
// Wrapper around InterpolateRow_16_AVX2 that is instantiated through the
// ANY11I macro with a final argument of 15. Same parameters as the AVX2 kernel.
// NOTE(review): presumably this variant accepts widths that are not a multiple
// of 16 - confirm against the ANY11I definition.
void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction);
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1939
#define LIBYUV_VERSION 1940
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -1858,6 +1858,15 @@ ANY11I(InterpolateRow_16_Any_NEON,
1,
7)
#endif
#ifdef HAS_INTERPOLATEROW_16_AVX2
// Instantiates InterpolateRow_16_Any_AVX2 on top of InterpolateRow_16_AVX2,
// with uint16_t given for both type arguments.
// NOTE(review): the trailing 15 is presumably the width mask matching the
// 16 samples the AVX2 kernel consumes per iteration (the NEON instantiation
// above passes 7) - confirm against the ANY11I definition.
ANY11I(InterpolateRow_16_Any_AVX2,
InterpolateRow_16_AVX2,
uint16_t,
uint16_t,
1,
1,
15)
#endif
#undef ANY11I
// Any 1 to 1 interpolate with scale param

View File

@ -4370,26 +4370,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
}
#endif
#ifdef HAS_RGB24TOYJROW_AVX2
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
}
#endif // HAS_RGB24TOYJROW_AVX2
#ifdef HAS_RAWTOYJROW_AVX2
// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
}
#endif // HAS_RAWTOYJROW_AVX2
#ifdef HAS_RGB24TOYJROW_SSSE3
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
}
#endif // HAS_RGB24TOYJROW_SSSE3
#ifdef HAS_RAWTOYJROW_SSSE3
// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
}
#endif // HAS_RAWTOYJROW_SSSE3
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
const uint16_t* src_ptr,
@ -4401,7 +4381,7 @@ void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
while (width > 0) {
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
InterpolateRow_16_AVX2(row, src_ptr, src_stride, twidth, source_y_fraction);
Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
src_ptr += twidth;
dst_ptr += twidth;

View File

@ -8911,6 +8911,85 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
}
#endif // HAS_INTERPOLATEROW_AVX2
#ifdef HAS_INTERPOLATEROW_16_AVX2
// Bilinear filter 16x2 -> 16x1
//
// Vertically blends two rows of 16-bit samples into a single output row:
//   dst[x] = (row0[x] * (256 - f) + row1[x] * f + 128) >> 8
// where row0 = src_ptr, row1 = src_ptr + src_stride and f = source_y_fraction.
//
// dst_ptr            Output row.
// src_ptr            First (upper) source row.
// src_stride         Distance to the second row, in uint16_t elements (it is
//                    scaled by 2 in the addressing below).
// width              Number of samples. Each loop consumes 16 samples and
//                    runs at least once, so this is presumably expected to be
//                    a positive multiple of 16 - confirm against callers.
// source_y_fraction  Weight of the second row out of 256. Not range checked
//                    here; assumed to be in [0, 255].
//
// Three paths are used: f == 0 copies row0 (row1 is never read), f == 128
// uses a rounded average, and everything else uses the general weighted blend.
void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int width,
int source_y_fraction) {
asm volatile(
// Turn dst into an offset from src so a single pointer increment advances
// both; stores below address (%1,%0,1).
"sub %1,%0 \n"
// Dispatch on the fraction: 0 -> copy (100), 128 -> average (50).
"cmp $0x0,%3 \n"
"je 100f \n"
"cmp $0x80,%3 \n"
"je 50f \n"
// Build the weight pair {256 - f, f} as two 16-bit words and broadcast it to
// every dword of ymm5, ready for vpmaddwd.
"vmovd %3,%%xmm0 \n"
"neg %3 \n"
"add $0x100,%3 \n"
"vmovd %3,%%xmm5 \n"
"vpunpcklwd %%xmm0,%%xmm5,%%xmm5 \n"
"vpbroadcastd %%xmm5,%%ymm5 \n"
// ymm4 = 0x8000 in every word. vpmaddwd multiplies signed words, so the
// unsigned samples are biased into signed range by subtracting 0x8000.
"mov $0x80008000,%%eax \n" // 0x80008000 used to bias unsigned words to signed range for vpmaddwd.
"vmovd %%eax,%%xmm4 \n"
"vbroadcastss %%xmm4,%%ymm4 \n"
// ymm3 = 32768 * 256 + 128 in every dword. The weights sum to 256, so the
// bias removes 32768 * 256 from each product sum; this adds it back together
// with the +128 rounding term.
"mov $8388736,%%eax \n" // 32768 * 256 + 128 rounding constant.
"vmovd %%eax,%%xmm3 \n"
"vbroadcastss %%xmm3,%%ymm3 \n"
// General blend loop: 16 samples per iteration.
LABELALIGN
"1: \n"
// Load 16 samples from each row.
"vmovdqu (%1),%%ymm0 \n"
"vmovdqu (%1,%4,2),%%ymm1 \n"
// Interleave row0/row1 words into {row0, row1} pairs (per 128-bit lane).
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n"
// Bias to signed range.
"vpsubw %%ymm4,%%ymm2,%%ymm2 \n"
"vpsubw %%ymm4,%%ymm0,%%ymm0 \n"
// Per dword: row0' * (256 - f) + row1' * f.
"vpmaddwd %%ymm5,%%ymm2,%%ymm2 \n"
"vpmaddwd %%ymm5,%%ymm0,%%ymm0 \n"
// Undo the bias and add the rounding term.
"vpaddd %%ymm3,%%ymm2,%%ymm2 \n"
"vpaddd %%ymm3,%%ymm0,%%ymm0 \n"
// Divide by 256.
"vpsrad $0x8,%%ymm2,%%ymm2 \n"
"vpsrad $0x8,%%ymm0,%%ymm0 \n"
// Pack dwords back to words; the in-lane pack mirrors the in-lane unpack,
// so the samples come out in their original order.
"vpackusdw %%ymm2,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,0x00(%1,%0,1) \n"
// Advance 32 bytes (16 samples); dst follows through the offset in %0.
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"jmp 99f \n"
// Blend 50 / 50: rounded average of the two rows, 16 samples per iteration.
"50: \n"
LABELALIGN
"2: \n"
"vmovdqu (%1),%%ymm0 \n"
"vpavgw (%1,%4,2),%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,0x00(%1,%0,1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 2b \n"
"jmp 99f \n"
// Blend 100 / 0: copy row0 unchanged; the second row is not touched.
"100: \n"
LABELALIGN
"3: \n"
"vmovdqu (%1),%%ymm0 \n"
"vmovdqu %%ymm0,0x00(%1,%0,1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 3b \n"
// Common exit for all three paths: clear the upper ymm halves.
"99: \n"
"vzeroupper \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(width), // %2
"+r"(source_y_fraction) // %3
: "r"(src_stride) // %4
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // HAS_INTERPOLATEROW_16_AVX2
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,