From 7892ea1fe14533d197f11a4f720d877eebaeb0b8 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 15 Dec 2014 18:59:23 +0000 Subject: [PATCH] Fix for ARGBToUV on AVX2 BUG=269 TESTED=local testing R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/33669004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1202 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_posix.cc | 2 ++ source/row_win.cc | 12 +++++++++--- 4 files changed, 13 insertions(+), 5 deletions(-) diff --git a/README.chromium b/README.chromium index edfeedc11..8680ab393 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1201 +Version: 1203 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 9ceeff62c..8cc1b5ed0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1201 +#define LIBYUV_VERSION 1203 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_posix.cc b/source/row_posix.cc index 78997edc2..38886c15d 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -983,6 +983,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n" "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n" "vpavgb %%ymm4,%%ymm2,%%ymm2 \n" + "vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n" "vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n" "vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n" @@ -995,6 +996,7 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, "vpermq $0xd8,%%ymm0,%%ymm0 \n" "vpshufb %8,%%ymm0,%%ymm0 \n" "vpaddb %%ymm5,%%ymm0,%%ymm0 \n" + "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n" VEXTOPMEM(vextractf128,ymm0,0x1,1,2,1) // vextractf128 $0x1,%%ymm0,(%1,%2,1) "lea " MEMLEA(0x10,1) ",%1 \n" diff --git a/source/row_win.cc b/source/row_win.cc index 4996de00f..31af591c9 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1473,7 +1473,6 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, } #endif // HAS_ARGBTOYROW_SSSE3 - #if defined(HAS_I422TOARGBROW_AVX2) || defined(HAS_I422TOBGRAROW_AVX2) static const lvec8 kUVToB_AVX = { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, @@ -1502,6 +1501,7 @@ static const lvec16 kUVBiasG_AVX = { static const lvec16 kUVBiasR_AVX = { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR }; +#endif // defined(HAS_I422TOARGBROW_AVX2) || defined(HAS_I422TOBGRAROW_AVX2) // Read 8 UV from 422, upsample to 16 UV. #define READYUV422_AVX2 __asm { \ @@ -1540,7 +1540,7 @@ static const lvec16 kUVBiasR_AVX = { __asm vpackuswb ymm2, ymm2, ymm2 /* R */ \ } -#if defined(HAS_I422TOARGBROW_AVX2) +#ifdef HAS_I422TOARGBROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes). __declspec(naked) __declspec(align(16)) @@ -1584,7 +1584,9 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, ret } } +#endif // HAS_I422TOARGBROW_AVX2 +#ifdef HAS_I422TOBGRAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 BGRA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. @@ -1629,7 +1631,9 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, ret } } +#endif // HAS_I422TOBGRAROW_AVX2 +#ifdef HAS_I422TORGBAROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. @@ -1674,7 +1678,9 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, ret } } +#endif // HAS_I422TORGBAROW_AVX2 +#ifdef HAS_I422TOABGRROW_AVX2 // 16 pixels // 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ABGR (64 bytes). // TODO(fbarchard): Use macros to reduce duplicate code. See SSSE3. @@ -1719,7 +1725,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, ret } } -#endif // HAS_I422TOARGBROW_AVX2 +#endif // HAS_I422TOABGRROW_AVX2 #ifdef HAS_I422TOARGBROW_SSSE3 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.