Optimize ABGRToI420 for AVX2

libyuv_test --gunit_filter=*ABGRToI420_Opt --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Was SSSE3 ABGRToI420_Opt (324 ms)
Now AVX2  ABGRToI420_Opt (253 ms)

Bug: b/155989084
Change-Id: I4f3831e29b379be758f9d3fcb244be088bb1ca3c
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2229606
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-06-03 16:43:55 -07:00 committed by Commit Bot
parent ce5b333853
commit c5e45dcae5
6 changed files with 59 additions and 14 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1758
Version: 1759
License: BSD
License File: LICENSE

View File

@ -39,10 +39,14 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
#define kYuvJPEGConstantsVU kYvuJPEGConstants
#define kYuvH709ConstantsVU kYvuH709Constants
#define kYuv2020ConstantsVU kYvu2020Constants
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \
NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i)
// Alias.
#define ARGBToARGB ARGBCopy

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1758
#define LIBYUV_VERSION 1759
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -608,11 +608,9 @@ int NV21ToI420(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
return NV12ToI420(src_y, src_stride_y,
src_vu, src_stride_vu,
dst_y, dst_stride_y,
dst_v, dst_stride_v,
dst_u, dst_stride_u, width, height);
return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y,
dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u,
width, height);
}
// Convert YUY2 to I420.
@ -1170,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr,
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;

View File

@ -349,9 +349,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
asm volatile(
YUVTORGB_SETUP
"vmov.u8 d23, #255 \n"
"1: \n"
READYUV400
YUVTORGB
"1: \n" READYUV400 YUVTORGB
"subs %2, %2, #8 \n"
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"

35
source/test.sh Executable file
View File

@ -0,0 +1,35 @@
#!/bin/bash
set -x
# Profile one libyuv_test benchmark under perf and list the AVX entries
# that appear in the resulting report.
#
# Arguments:
#   $1 - test name suffix; the gunit filter passed to the binary is "*<suffix>".
# Environment:
#   LIBYUV_TEST - optional path to the libyuv_test binary. When unset or
#                 empty, the original hard-coded blaze-bin path is used.
function runbenchmark1 {
  local libyuv_test="${LIBYUV_TEST:-/google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test}"
  # The filter is quoted so the leading '*' reaches gunit verbatim instead of
  # being subject to pathname expansion (and $1 to word splitting).
  perf record "$libyuv_test" "--gunit_filter=*$1" --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1
  perf report | grep AVX
}
# Run the perf profile once per benchmark below. Each name is handed to
# runbenchmark1, which uses it as the suffix of a "*<name>" gunit filter.
runbenchmark1 ABGRToI420
runbenchmark1 Android420ToI420
runbenchmark1 ARGBToI420
runbenchmark1 Convert16To8Plane
runbenchmark1 ConvertToARGB
runbenchmark1 ConvertToI420
runbenchmark1 CopyPlane
runbenchmark1 H010ToAB30
runbenchmark1 H010ToAR30
runbenchmark1 HalfFloatPlane
runbenchmark1 I010ToAB30
runbenchmark1 I010ToAR30
runbenchmark1 I420Copy
runbenchmark1 I420Psnr
runbenchmark1 I420Scale
runbenchmark1 I420Ssim
runbenchmark1 I420ToARGB
runbenchmark1 I420ToNV12
runbenchmark1 I420ToUYVY
runbenchmark1 I422ToI420
runbenchmark1 InitCpuFlags
runbenchmark1 J420ToARGB
runbenchmark1 NV12ToARGB
runbenchmark1 NV12ToI420
runbenchmark1 NV12ToI420Rotate
runbenchmark1 SetCpuFlags
runbenchmark1 YUY2ToI420