From c5e45dcae58f5cb3eb893f8000c1de88a8fe3c4e Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 3 Jun 2020 16:43:55 -0700 Subject: [PATCH] Optimize ABGRToI420 for AVX2 libyuv_test --gunit_filter=*ABGRToI420_Opt --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 Was SSSE3 ABGRToI420_Opt (324 ms) Now AVX2 ABGRToI420_Opt (253 ms) Bug: b/155989084 Change-Id: I4f3831e29b379be758f9d3fcb244be088bb1ca3c Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2229606 Reviewed-by: Miguel Casas Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/convert_argb.h | 12 ++++++++---- include/libyuv/version.h | 2 +- source/convert.cc | 18 +++++++++++++----- source/row_neon.cc | 4 +--- source/test.sh | 35 +++++++++++++++++++++++++++++++++++ 6 files changed, 59 insertions(+), 14 deletions(-) create mode 100755 source/test.sh diff --git a/README.chromium b/README.chromium index 860799e76..a9638f83d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1758 +Version: 1759 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index a45b94c7f..ce7457320 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -39,10 +39,14 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020 #define kYuvJPEGConstantsVU kYvuJPEGConstants #define kYuvH709ConstantsVU kYvuH709Constants #define kYuv2020ConstantsVU kYvu2020Constants -#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) -#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) -#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define 
NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ + NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) +#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \ + NV12ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i) +#define NV12ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ + NV21ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) +#define NV21ToRAWMatrix(a, b, c, d, e, f, g, h, i) \ + NV12ToRGB24Matrix(a, b, c, d, e, f, g##VU, h, i) // Alias. #define ARGBToARGB ARGBCopy diff --git a/include/libyuv/version.h b/include/libyuv/version.h index ba3ef50c1..d6ee08386 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1758 +#define LIBYUV_VERSION 1759 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 3886f3f01..98258b9bc 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -608,11 +608,9 @@ int NV21ToI420(const uint8_t* src_y, int dst_stride_v, int width, int height) { - return NV12ToI420(src_y, src_stride_y, - src_vu, src_stride_vu, - dst_y, dst_stride_y, - dst_v, dst_stride_v, - dst_u, dst_stride_u, width, height); + return NV12ToI420(src_y, src_stride_y, src_vu, src_stride_vu, dst_y, + dst_stride_y, dst_v, dst_stride_v, dst_u, dst_stride_u, + width, height); } // Convert YUY2 to I420. 
@@ -1170,6 +1168,16 @@ int ABGRToI420(const uint8_t* src_abgr, } } #endif +#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ABGRToUVRow = ABGRToUVRow_Any_AVX2; + ABGRToYRow = ABGRToYRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ABGRToUVRow = ABGRToUVRow_AVX2; + ABGRToYRow = ABGRToYRow_AVX2; + } + } +#endif #if defined(HAS_ABGRTOYROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ABGRToYRow = ABGRToYRow_Any_NEON; diff --git a/source/row_neon.cc b/source/row_neon.cc index 137073386..b81c53ff2 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -349,9 +349,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y, asm volatile( YUVTORGB_SETUP "vmov.u8 d23, #255 \n" - "1: \n" - READYUV400 - YUVTORGB + "1: \n" READYUV400 YUVTORGB "subs %2, %2, #8 \n" "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" diff --git a/source/test.sh b/source/test.sh new file mode 100755 index 000000000..7f12c3c15 --- /dev/null +++ b/source/test.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -x + +function runbenchmark1 { + perf record /google/src/cloud/fbarchard/clean/google3/blaze-bin/third_party/libyuv/libyuv_test --gunit_filter=*$1 --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 + perf report | grep AVX +} + +runbenchmark1 ABGRToI420 +runbenchmark1 Android420ToI420 +runbenchmark1 ARGBToI420 +runbenchmark1 Convert16To8Plane +runbenchmark1 ConvertToARGB +runbenchmark1 ConvertToI420 +runbenchmark1 CopyPlane +runbenchmark1 H010ToAB30 +runbenchmark1 H010ToAR30 +runbenchmark1 HalfFloatPlane +runbenchmark1 I010ToAB30 +runbenchmark1 I010ToAR30 +runbenchmark1 I420Copy +runbenchmark1 I420Psnr +runbenchmark1 I420Scale +runbenchmark1 I420Ssim +runbenchmark1 I420ToARGB +runbenchmark1 I420ToNV12 +runbenchmark1 I420ToUYVY +runbenchmark1 I422ToI420 +runbenchmark1 InitCpuFlags +runbenchmark1 J420ToARGB +runbenchmark1 NV12ToARGB +runbenchmark1 NV12ToI420 +runbenchmark1 NV12ToI420Rotate +runbenchmark1 SetCpuFlags 
+runbenchmark1 YUY2ToI420