From 208280598e95e8504d6d862adba40b67abc8a945 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 8 Feb 2013 23:27:22 +0000 Subject: [PATCH] Remove vmovdqa from UV code BUG=181 TESTED=c:\intelsde\sde -hsw -- out\release\libyuv_unittest.exe --gtest_filter=*ARGBToI420* Review URL: https://webrtc-codereview.appspot.com/1091010 git-svn-id: http://libyuv.googlecode.com/svn/trunk@567 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_win.cc | 68 ++++++++++++++++------------------------ 3 files changed, 29 insertions(+), 43 deletions(-) diff --git a/README.chromium b/README.chromium index e36271cef..96c4ad890 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 566 +Version: 567 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 156ecfd19..9b1b24ed1 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 566 +#define LIBYUV_VERSION 567 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_win.cc b/source/row_win.cc index 95b912e7c..b9e4a8488 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -797,10 +797,6 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { lea edx, [edx + 32] jg convertloop ret -vphaddw ymm0, ymm0, ymm1 -vpermq ymm0, ymm0, 0xd8 -vpackuswb ymm0, ymm0, ymm2 -vpermq ymm0, ymm0, 0xd8 } } #endif // HAS_ARGBTOYROW_AVX2 @@ -1176,39 +1172,34 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb, vpavgb ymm2, ymm2, [eax + esi + 64] vpavgb ymm3, ymm3, [eax + esi + 96] lea eax, [eax + 128] - vmovdqa ymm4, ymm0 // TODO(fbarchard): Remove. - vshufps ymm0, ymm0, ymm1, 0x88 - vshufps ymm4, ymm4, ymm1, 0xdd + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd vpavgb ymm0, ymm0, ymm4 vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. - vmovdqa ymm4, ymm2 // TODO(fbarchard): Remove. - vshufps ymm2, ymm2, ymm3, 0x88 - vshufps ymm4, ymm4, ymm3, 0xdd + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove. // step 2 - convert to U and V // from here down is very similar to Y code except // instead of 32 different pixels, its 16 pixels of U and 16 of V - vmovdqa ymm1, ymm0 // TODO(fbarchard): Remove. - vmovdqa ymm3, ymm2 // TODO(fbarchard): Remove. - vpmaddubsw ymm0, ymm0, ymm7 // U - vpmaddubsw ymm2, ymm2, ymm7 - vpmaddubsw ymm1, ymm1, ymm6 // V - vpmaddubsw ymm3, ymm3, ymm6 - vphaddw ymm0, ymm0, ymm2 - vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 vphaddw ymm1, ymm1, ymm3 vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove. - vpsraw ymm0, ymm0, 8 + vphaddw ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. vpsraw ymm1, ymm1, 8 - vpacksswb ymm0, ymm0, ymm1 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 vpermq ymm0, ymm0, 0xd8 vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values sub ecx, 32 - vmovdqa ymm1, ymm0 vextractf128 qword ptr [edx], ymm0, 0 // U vextractf128 qword ptr [edx + edi], ymm0, 1 // V lea edx, [edx + 16] @@ -1320,39 +1311,34 @@ void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb, vpavgb ymm2, ymm2, [eax + esi + 64] vpavgb ymm3, ymm3, [eax + esi + 96] lea eax, [eax + 128] - vmovdqa ymm4, ymm0 - vshufps ymm0, ymm0, ymm1, 0x88 - vshufps ymm4, ymm4, ymm1, 0xdd + vshufps ymm4, ymm0, ymm1, 0x88 + vshufps ymm0, ymm0, ymm1, 0xdd vpavgb ymm0, ymm0, ymm4 - vpermq ymm0, ymm0, 0xd8 - vmovdqa ymm4, ymm2 - vshufps ymm2, ymm2, ymm3, 0x88 - vshufps ymm4, ymm4, ymm3, 0xdd + vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. + vshufps ymm4, ymm2, ymm3, 0x88 + vshufps ymm2, ymm2, ymm3, 0xdd vpavgb ymm2, ymm2, ymm4 - vpermq ymm2, ymm2, 0xd8 + vpermq ymm2, ymm2, 0xd8 // TODO(fbarchard): Remove. // step 2 - convert to U and V // from here down is very similar to Y code except // instead of 32 different pixels, its 16 pixels of U and 16 of V - vmovdqa ymm1, ymm0 - vmovdqa ymm3, ymm2 - vpmaddubsw ymm0, ymm0, ymm7 // U - vpmaddubsw ymm2, ymm2, ymm7 - vpmaddubsw ymm1, ymm1, ymm6 // V - vpmaddubsw ymm3, ymm3, ymm6 - vphaddw ymm0, ymm0, ymm2 - vpermq ymm0, ymm0, 0xd8 + vpmaddubsw ymm1, ymm0, ymm7 // U + vpmaddubsw ymm3, ymm2, ymm7 + vpmaddubsw ymm0, ymm0, ymm6 // V + vpmaddubsw ymm2, ymm2, ymm6 vphaddw ymm1, ymm1, ymm3 - vpermq ymm1, ymm1, 0xd8 - vpsraw ymm0, ymm0, 8 + vpermq ymm1, ymm1, 0xd8 // TODO(fbarchard): Remove. + vphaddw ymm0, ymm0, ymm2 + vpermq ymm0, ymm0, 0xd8 // TODO(fbarchard): Remove. vpsraw ymm1, ymm1, 8 - vpacksswb ymm0, ymm0, ymm1 + vpsraw ymm0, ymm0, 8 + vpacksswb ymm0, ymm1, ymm0 vpermq ymm0, ymm0, 0xd8 vpaddb ymm0, ymm0, ymm5 // -> unsigned // step 3 - store 16 U and 16 V values sub ecx, 32 - vmovdqa ymm1, ymm0 vextractf128 qword ptr [edx], ymm0, 0 // U vextractf128 qword ptr [edx + edi], ymm0, 1 // V lea edx, [edx + 16]