From 22eb5965fcce57589fc104221ec6d0d69b64b883 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 22 Oct 2014 23:39:16 +0000 Subject: [PATCH] Optimize I422ToRGBA for AVX2 by hoisting ymm5 initialization and using a different register for output of unpack. BUG=269 TESTED=intelsde on I422ToABGR R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/29889004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1137 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_win.cc | 13 ++++++------- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/README.chromium b/README.chromium index a9dc5d468..5ff9eef8a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1136 +Version: 1137 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d9f0aae22..f4f370fa8 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1136 +#define LIBYUV_VERSION 1137 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_win.cc b/source/row_win.cc index 9a87a67e0..d28df8e65 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1613,7 +1613,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). -// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. __declspec(naked) __declspec(align(16)) void I422ToBGRARow_AVX2(const uint8* y_buf, const uint8* u_buf, @@ -1659,7 +1658,6 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). -// TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. 
__declspec(naked) __declspec(align(16)) void I422ToRGBARow_AVX2(const uint8* y_buf, const uint8* u_buf, @@ -1675,6 +1673,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, mov edx, [esp + 8 + 16] // argb mov ecx, [esp + 8 + 20] // width sub edi, esi + vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpxor ymm4, ymm4, ymm4 align 4 @@ -1683,13 +1682,12 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, YUVTORGB_AVX2 // Step 3: Weave into RGBA - vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha vpunpcklbw ymm1, ymm1, ymm2 // GR vpermq ymm1, ymm1, 0xd8 - vpunpcklbw ymm5, ymm5, ymm0 // AB - vpermq ymm5, ymm5, 0xd8 - vpunpcklwd ymm0, ymm5, ymm1 // ABGR first 8 pixels - vpunpckhwd ymm1, ymm5, ymm1 // ABGR next 8 pixels + vpunpcklbw ymm2, ymm5, ymm0 // AB + vpermq ymm2, ymm2, 0xd8 + vpunpcklwd ymm0, ymm2, ymm1 // ABGR first 8 pixels + vpunpckhwd ymm1, ymm2, ymm1 // ABGR next 8 pixels vmovdqu [edx], ymm0 vmovdqu [edx + 32], ymm1 lea edx, [edx + 64] @@ -1702,6 +1700,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ret } } + #endif // HAS_I422TOARGBROW_AVX2 #ifdef HAS_I422TOARGBROW_SSSE3