mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
InterpolateRow_16_AVX2 for row_gcc
On AMD Zen4:

Was: C TestInterpolatePlane_16 (143 ms)
Now: AVX2 TestInterpolatePlane_16 (48 ms)

Was: I210ToI420_Opt (87 ms)
  35.60% InterpolateRow_16To8_AVX2
  31.03% Convert16To8Row_AVX512BW
  21.35% Convert16To8Row_AVX2

Now: I210ToI420_Opt (69 ms)
  37.57% Convert16To8Row_AVX512BW
  32.69% InterpolateRow_16_AVX2
  7.18% Convert16To8Row_AVX2
  5.23% InterpolateRow_16To8_AVX2

Bug: None
Change-Id: Ica9b9c5dbd847068ae076b682c487e1753d3c812
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7855648
Reviewed-by: Dale Curtis <dalecurtis@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
cda55fcf53
commit
9f751100d2
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1939
|
||||
Version: 1940
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -323,6 +323,7 @@ extern "C" {
|
||||
#define HAS_I422TOUYVYROW_AVX2
|
||||
#define HAS_I422TOYUY2ROW_AVX2
|
||||
#define HAS_INTERPOLATEROW_16TO8_AVX2
|
||||
#define HAS_INTERPOLATEROW_16_AVX2
|
||||
#define HAS_MERGEAR64ROW_AVX2
|
||||
#define HAS_MERGEARGB16TO8ROW_AVX2
|
||||
#define HAS_MERGEARGBROW_AVX2
|
||||
@ -6700,6 +6701,16 @@ void InterpolateRow_16_C(uint16_t* dst_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int width,
|
||||
int source_y_fraction);
|
||||
// AVX2 kernel that blends the 16-bit row at src_ptr with the row src_stride
// elements after it into dst_ptr, weighting the second row by
// source_y_fraction / 256. The implementation consumes width in steps of 16
// elements.
void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int width,
|
||||
int source_y_fraction);
|
||||
// Wrapper around InterpolateRow_16_AVX2 that is instantiated with the ANY11I
// macro. NOTE(review): presumably accepts any width and handles the remainder
// that is not a multiple of 16 separately - confirm against the ANY11I
// definition.
void InterpolateRow_16_Any_AVX2(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int width,
|
||||
int source_y_fraction);
|
||||
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1939
|
||||
#define LIBYUV_VERSION 1940
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -1858,6 +1858,15 @@ ANY11I(InterpolateRow_16_Any_NEON,
|
||||
1,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_INTERPOLATEROW_16_AVX2
|
||||
// Instantiates InterpolateRow_16_Any_AVX2 on top of InterpolateRow_16_AVX2
// with uint16_t destination and source element types.
// NOTE(review): the trailing 15 is presumably the width mask matching the
// kernel's 16-element step, and the two 1s presumably per-pixel element
// counts - confirm against the ANY11I macro definition.
ANY11I(InterpolateRow_16_Any_AVX2,
|
||||
InterpolateRow_16_AVX2,
|
||||
uint16_t,
|
||||
uint16_t,
|
||||
1,
|
||||
1,
|
||||
15)
|
||||
#endif
|
||||
#undef ANY11I
|
||||
|
||||
// Any 1 to 1 interpolate with scale param
|
||||
|
||||
@ -4370,26 +4370,6 @@ void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_RGB24TOYJROW_AVX2
|
||||
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
|
||||
}
|
||||
#endif // HAS_RGB24TOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_RAWTOYJROW_AVX2
|
||||
// Convert 32 RAW pixels (128 bytes) to 32 YJ values.
|
||||
}
|
||||
#endif // HAS_RAWTOYJROW_AVX2
|
||||
|
||||
#ifdef HAS_RGB24TOYJROW_SSSE3
|
||||
// Convert 16 RGB24 pixels (64 bytes) to 16 YJ values.
|
||||
}
|
||||
#endif // HAS_RGB24TOYJROW_SSSE3
|
||||
|
||||
#ifdef HAS_RAWTOYJROW_SSSE3
|
||||
// Convert 16 RAW pixels (64 bytes) to 16 YJ values.
|
||||
}
|
||||
#endif // HAS_RAWTOYJROW_SSSE3
|
||||
|
||||
#ifdef HAS_INTERPOLATEROW_16TO8_AVX2
|
||||
void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
@ -4401,7 +4381,7 @@ void InterpolateRow_16To8_AVX2(uint8_t* dst_ptr,
|
||||
SIMD_ALIGNED(uint16_t row[MAXTWIDTH]);
|
||||
while (width > 0) {
|
||||
int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
|
||||
InterpolateRow_16_C(row, src_ptr, src_stride, twidth, source_y_fraction);
|
||||
InterpolateRow_16_AVX2(row, src_ptr, src_stride, twidth, source_y_fraction);
|
||||
Convert16To8Row_AVX2(row, dst_ptr, scale, twidth);
|
||||
src_ptr += twidth;
|
||||
dst_ptr += twidth;
|
||||
|
||||
@ -8911,6 +8911,85 @@ void InterpolateRow_AVX2(uint8_t* dst_ptr,
|
||||
}
|
||||
#endif // HAS_INTERPOLATEROW_AVX2
|
||||
|
||||
#ifdef HAS_INTERPOLATEROW_16_AVX2
|
||||
// Bilinear filter 16x2 -> 16x1
// Blends two rows of 16-bit samples into one:
//   dst[i] = (row0[i] * (256 - f) + row1[i] * f + 128) >> 8
// where row0 = src_ptr, row1 = src_ptr + src_stride (stride counted in
// uint16_t elements) and f = source_y_fraction.
// f == 0 copies row0; f == 128 uses a rounded average (vpavgw).
// NOTE(review): f is assumed to be in 0..255 - confirm against callers.
// Each iteration handles 16 elements and the loops run while the remaining
// width is > 0, so a width that is not a multiple of 16 still loads and stores
// a full final 16-element vector.
|
||||
void InterpolateRow_16_AVX2(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int width,
|
||||
int source_y_fraction) {
|
||||
asm volatile(
|
||||
"sub %1,%0 \n" // %0 = dst - src, so (%1,%0,1) addresses dst while only %1 advances.
|
||||
"cmp $0x0,%3 \n" // fraction == 0: plain copy of row 0.
|
||||
"je 100f \n"
|
||||
"cmp $0x80,%3 \n" // fraction == 128: rounded average of the two rows.
|
||||
"je 50f \n"
|
||||
|
||||
"vmovd %3,%%xmm0 \n" // xmm0 = f (weight of row 1).
|
||||
"neg %3 \n"
|
||||
"add $0x100,%3 \n" // %3 = 256 - f (weight of row 0).
|
||||
"vmovd %3,%%xmm5 \n"
|
||||
"vpunpcklwd %%xmm0,%%xmm5,%%xmm5 \n" // Low dword = word pair {256 - f, f}.
|
||||
"vpbroadcastd %%xmm5,%%ymm5 \n" // Replicate the weight pair to every dword.
|
||||
"mov $0x80008000,%%eax \n" // 0x80008000 used to bias unsigned words to signed range for vpmaddwd.
|
||||
"vmovd %%eax,%%xmm4 \n"
|
||||
"vbroadcastss %%xmm4,%%ymm4 \n" // ymm4 = 0x8000 in every word.
|
||||
"mov $8388736,%%eax \n" // 32768 * 256 + 128 rounding constant.
|
||||
"vmovd %%eax,%%xmm3 \n"
|
||||
"vbroadcastss %%xmm3,%%ymm3 \n" // ymm3 = bias undo + rounding in every dword.
|
||||
|
||||
|
||||
LABELALIGN
|
||||
"1: \n" // General blend loop.
|
||||
"vmovdqu (%1),%%ymm0 \n" // 16 words of row 0.
|
||||
"vmovdqu (%1,%4,2),%%ymm1 \n" // 16 words of row 1 (stride scaled by 2 bytes per element).
|
||||
"vpunpckhwd %%ymm1,%%ymm0,%%ymm2 \n" // {row0, row1} word pairs, high half of each 128-bit lane.
|
||||
"vpunpcklwd %%ymm1,%%ymm0,%%ymm0 \n" // {row0, row1} word pairs, low half of each 128-bit lane.
|
||||
"vpsubw %%ymm4,%%ymm2,%%ymm2 \n" // Subtract 0x8000 so the samples fit signed words.
|
||||
"vpsubw %%ymm4,%%ymm0,%%ymm0 \n"
|
||||
"vpmaddwd %%ymm5,%%ymm2,%%ymm2 \n" // row0 * (256 - f) + row1 * f - 32768 * 256 per dword.
|
||||
"vpmaddwd %%ymm5,%%ymm0,%%ymm0 \n"
|
||||
"vpaddd %%ymm3,%%ymm2,%%ymm2 \n" // Undo the bias and add 128 for rounding.
|
||||
"vpaddd %%ymm3,%%ymm0,%%ymm0 \n"
|
||||
"vpsrad $0x8,%%ymm2,%%ymm2 \n" // Divide by 256.
|
||||
"vpsrad $0x8,%%ymm0,%%ymm0 \n"
|
||||
"vpackusdw %%ymm2,%%ymm0,%%ymm0 \n" // Pack to words; per-lane pack restores the order split by the unpacks.
|
||||
"vmovdqu %%ymm0,0x00(%1,%0,1) \n" // Store 16 words to dst.
|
||||
"lea 0x20(%1),%1 \n" // Advance src by 32 bytes (16 elements).
|
||||
"sub $0x10,%2 \n" // width -= 16.
|
||||
"jg 1b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
|
||||
"50: \n" // fraction == 128 path.
|
||||
LABELALIGN
|
||||
"2: \n"
|
||||
"vmovdqu (%1),%%ymm0 \n"
|
||||
"vpavgw (%1,%4,2),%%ymm0,%%ymm0 \n" // Rounded unsigned word average of row 0 and row 1.
|
||||
"vmovdqu %%ymm0,0x00(%1,%0,1) \n"
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 2b \n"
|
||||
"jmp 99f \n"
|
||||
|
||||
|
||||
"100: \n" // fraction == 0 path: row 1 is never read.
|
||||
LABELALIGN
|
||||
"3: \n"
|
||||
"vmovdqu (%1),%%ymm0 \n"
|
||||
"vmovdqu %%ymm0,0x00(%1,%0,1) \n"
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 3b \n"
|
||||
|
||||
|
||||
"99: \n" // Common exit for all three paths.
|
||||
"vzeroupper \n" // Clear the upper halves of the ymm registers before returning.
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(width), // %2
|
||||
"+r"(source_y_fraction) // %3
|
||||
: "r"(src_stride) // %4
|
||||
: "memory", "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
|
||||
}
|
||||
#endif // HAS_INTERPOLATEROW_16_AVX2
|
||||
|
||||
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
|
||||
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
|
||||
void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user