mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
ARGBToUV AVX2 for x86_64
Benchmarks on Icelake: was SSSE3+SSSE3 ARGBToJ420_Opt (356 ms); was SSSE3+AVX2 ARGBToJ420_Opt (301 ms); now AVX2+AVX2 ARGBToJ420_Opt (227 ms). Change-Id: I2cb427bc164b225b3ad4c5f43c09d6da6ca496d5 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6943036 Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
parent
0f795672ae
commit
a61882c049
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||||
Version: 1917
|
Version: 1918
|
||||||
License: BSD-3-Clause
|
License: BSD-3-Clause
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
Shipped: yes
|
Shipped: yes
|
||||||
|
|||||||
@ -350,7 +350,7 @@ extern "C" {
|
|||||||
// #define HAS_ABGRTOUVJROW_AVX2
|
// #define HAS_ABGRTOUVJROW_AVX2
|
||||||
// #define HAS_ABGRTOUVROW_AVX2
|
// #define HAS_ABGRTOUVROW_AVX2
|
||||||
// #define HAS_ARGBTOUVJROW_AVX2
|
// #define HAS_ARGBTOUVJROW_AVX2
|
||||||
// #define HAS_ARGBTOUVROW_AVX2
|
#define HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
#if defined(__x86_64__) || !defined(__pic__)
|
#if defined(__x86_64__) || !defined(__pic__)
|
||||||
// TODO(fbarchard): fix build error on android_full_debug=1
|
// TODO(fbarchard): fix build error on android_full_debug=1
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1917
|
#define LIBYUV_VERSION 1918
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -1822,6 +1822,86 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
|
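// Shuffle 2 adjacent ARGB pixels so matching channel bytes form pairs
// (aarrggbb), ready for a horizontal pairwise add with vpmaddubsw.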
|
||||||
|
static const vec8 kShuffleAARRGGBB = {0, 4, 1, 5, 2, 6, 3, 7,
|
||||||
|
8, 12, 9, 13, 10, 14, 11, 15};
|
||||||
|
|
||||||
|
// Converts a 16x2 block of ARGB pixels (16 pixels from each of 2 rows) per
// loop iteration into 8 U and 8 V values.
|
||||||
|
// Each U/V pair is computed from the rounded average of a 2x2 block of
// ARGB pixels (sum of 4, shift right by 1, then vpavgw against zero).
|
||||||
|
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||||
|
int src_stride_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width,
|
||||||
|
const struct RgbUVConstants* rgbuvconstants) {
|
||||||
|
asm volatile(
|
||||||
|
"vbroadcastf128 %7,%%ymm15 \n" // kShuffleAARRGGBB
|
||||||
|
"vpcmpeqb %%ymm14,%%ymm14,%%ymm14 \n" // 0x0101
|
||||||
|
"vpabsb %%ymm14,%%ymm14 \n"
|
||||||
|
"vpxor %%ymm13,%%ymm13,%%ymm13 \n" // 0 for vpavgw
|
||||||
|
|
||||||
|
"vbroadcastf128 %5,%%ymm6 \n" // RGBToU
|
||||||
|
"vbroadcastf128 %6,%%ymm7 \n" // RGBToV
|
||||||
|
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0x8000
|
||||||
|
"vpsllw $15,%%ymm5,%%ymm5 \n"
|
||||||
|
|
||||||
|
"sub %1,%2 \n"
|
||||||
|
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
"vmovdqu (%0),%%ymm8 \n" // Read 16x2 ARGB Pixels
|
||||||
|
"vmovdqu 0x20(%0),%%ymm9 \n"
|
||||||
|
"vmovdqu 0x00(%0,%4,1),%%ymm10 \n"
|
||||||
|
"vmovdqu 0x20(%0,%4,1),%%ymm11 \n"
|
||||||
|
"vpshufb %%ymm15,%%ymm8,%%ymm8 \n" // aarrggbb
|
||||||
|
"vpshufb %%ymm15,%%ymm9,%%ymm9 \n"
|
||||||
|
"vpshufb %%ymm15,%%ymm10,%%ymm10 \n"
|
||||||
|
"vpshufb %%ymm15,%%ymm11,%%ymm11 \n"
|
||||||
|
"vpmaddubsw %%ymm14,%%ymm8,%%ymm8 \n" // 16x2 -> 8x2
|
||||||
|
"vpmaddubsw %%ymm14,%%ymm9,%%ymm9 \n"
|
||||||
|
"vpmaddubsw %%ymm14,%%ymm10,%%ymm10 \n"
|
||||||
|
"vpmaddubsw %%ymm14,%%ymm11,%%ymm11 \n"
|
||||||
|
"vpaddw %%ymm8,%%ymm10,%%ymm8 \n" // 8x2 -> 8x1
|
||||||
|
"vpaddw %%ymm9,%%ymm11,%%ymm9 \n"
|
||||||
|
"vpsrlw $1,%%ymm8,%%ymm8 \n"
|
||||||
|
"vpsrlw $1,%%ymm9,%%ymm9 \n"
|
||||||
|
"vpavgw %%ymm13,%%ymm8,%%ymm8 \n"
|
||||||
|
"vpavgw %%ymm13,%%ymm9,%%ymm9 \n"
|
||||||
|
"vpackuswb %%ymm9,%%ymm8,%%ymm0 \n" // mutates
|
||||||
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // 8 ARGB Pixels
|
||||||
|
|
||||||
|
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" // 8 V
|
||||||
|
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" // 8 U
|
||||||
|
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" // uuuuvvvv uuuuvvvv
|
||||||
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n" // uuuuuuuu vvvvvvvv
|
||||||
|
"vpsubw %%ymm0,%%ymm5,%%ymm2 \n"
|
||||||
|
"vpsrlw $0x8,%%ymm2,%%ymm2 \n"
|
||||||
|
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n" // mutates 8U8u- 8V8v
|
||||||
|
"vmovq %%xmm2,(%1) \n" // Write 8 U's
|
||||||
|
"vextractf128 $0x1,%%ymm2,%%xmm2 \n" // Copy V to low 8 bytes
|
||||||
|
"vmovq %%xmm2,0x00(%1,%2,1) \n" // Write 8 V's
|
||||||
|
|
||||||
|
"lea 0x40(%0),%0 \n"
|
||||||
|
"lea 0x8(%1),%1 \n"
|
||||||
|
"subl $0x10,%3 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
"vzeroupper \n"
|
||||||
|
: "+r"(src_argb), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+r"(width) // %3
|
||||||
|
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||||
|
"m"(rgbuvconstants->kRGBToU), // %5
|
||||||
|
"m"(rgbuvconstants->kRGBToV), // %6
|
||||||
|
"m"(kShuffleAARRGGBB) // %7
|
||||||
|
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7",
|
||||||
|
"xmm8", "xmm9", "xmm10", "xmm11", "xmm13", "xmm14", "xmm15");
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUV444ROW_SSSE3
|
#ifdef HAS_ARGBTOUV444ROW_SSSE3
|
||||||
|
|
||||||
// RGB to BT601 coefficients
|
// RGB to BT601 coefficients
|
||||||
@ -1855,6 +1935,18 @@ void ARGBToUV444Row_AVX2(const uint8_t* src_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOUV444ROW_AVX2
|
#endif // HAS_ARGBTOUV444ROW_AVX2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBTOUVROW_AVX2
|
||||||
|
void ARGBToUVRow_AVX2(const uint8_t* src_argb,
|
||||||
|
int src_stride_argb,
|
||||||
|
uint8_t* dst_u,
|
||||||
|
uint8_t* dst_v,
|
||||||
|
int width) {
|
||||||
|
ARGBToUVMatrixRow_AVX2(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||||
|
&kARGBI601UVConstants);
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||||
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
|
void ARGBToUVRow_SSSE3(const uint8_t* src_argb,
|
||||||
int src_stride_argb,
|
int src_stride_argb,
|
||||||
@ -3089,7 +3181,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
|
|||||||
"lea 0x20(%[y_buf]),%[y_buf] \n"
|
"lea 0x20(%[y_buf]),%[y_buf] \n"
|
||||||
|
|
||||||
// Read 8 UV from 210, upsample to 16 UV
|
// Read 8 UV from 210, upsample to 16 UV
|
||||||
// TODO(fbarchard): Consider vshufb to replace pack/unpack
|
// TODO(fbarchard): Consider vpshufb to replace pack/unpack
|
||||||
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
|
// TODO(fbarchard): Consider vunpcklpd to combine the 2 registers into 1.
|
||||||
#define READYUV210_AVX2 \
|
#define READYUV210_AVX2 \
|
||||||
"vmovdqu (%[u_buf]),%%xmm3 \n" \
|
"vmovdqu (%[u_buf]),%%xmm3 \n" \
|
||||||
|
|||||||
@ -2771,10 +2771,10 @@ TEST_F(LibYUVConvertTest, TestARGBToUVRow) {
|
|||||||
benchmark_width_ * benchmark_height_ * benchmark_iterations_ / 32;
|
benchmark_width_ * benchmark_height_ * benchmark_iterations_ / 32;
|
||||||
|
|
||||||
for (int i = 0; i < benchmark_iterations; ++i) {
|
for (int i = 0; i < benchmark_iterations; ++i) {
|
||||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
#if defined(HAS_ARGBTOUVROW_AVX2)
|
||||||
int has_ssse3 = TestCpuFlag(kCpuHasSSSE3);
|
int has_avx2 = TestCpuFlag(kCpuHasAVX2);
|
||||||
if (has_ssse3) {
|
if (has_avx2) {
|
||||||
ARGBToUVRow_SSSE3(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
|
ARGBToUVRow_AVX2(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
|
||||||
} else {
|
} else {
|
||||||
ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
|
ARGBToUVRow_C(&orig_argb_pixels[0], 0, &dest_u[0], &dest_v[0], 64);
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user