From 91c50c3a7d8736aa5834d6c54ae1c6bbea581e1f Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 3 Apr 2013 23:47:10 +0000 Subject: [PATCH] ARGBToYJ_AVX2 port to AVX2. BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/1272008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@640 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/row.h | 3 +++ source/convert_from_argb.cc | 16 ++++++++++++ source/row_any.cc | 1 + source/row_win.cc | 50 +++++++++++++++++++++++++++++++++++++ 4 files changed, 70 insertions(+) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7780f5873..84bbbba46 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -137,6 +137,7 @@ extern "C" { #define HAS_ARGBSHUFFLEROW_AVX2 #define HAS_ARGBTOUVROW_AVX2 #define HAS_ARGBTOYROW_AVX2 +#define HAS_ARGBTOYJROW_AVX2 #define HAS_HALFROW_AVX2 #define HAS_MERGEUVROW_AVX2 #define HAS_MIRRORROW_AVX2 @@ -403,6 +404,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix); +void ARGBToYJRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix); void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix); void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index cc5171ff3..7949c87c1 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1012,6 +1012,14 @@ int ARGBToJ420(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = 
ARGBToYJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYJRow = ARGBToYJRow_Any_NEON; @@ -1077,6 +1085,14 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOYJROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { + ARGBToYJRow = ARGBToYJRow_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + ARGBToYJRow = ARGBToYJRow_AVX2; + } + } +#endif #if defined(HAS_ARGBTOYJROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { ARGBToYJRow = ARGBToYJRow_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 07202596c..6c0d4f4a3 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -204,6 +204,7 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, #ifdef HAS_ARGBTOYROW_AVX2 YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, 4, 1, 32) +YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, 4, 1, 32) YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, 2, 1, 32) YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, 2, 1, 32) #endif diff --git a/source/row_win.cc b/source/row_win.cc index a939c82a5..2994c3634 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -35,6 +35,11 @@ static const lvec8 kARGBToY_AVX = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; +static const lvec8 kARGBToYJ_AVX = { + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, + 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0 +}; + static const vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; @@ -119,6 +124,9 @@ static const uvec8 kAddY16 = { static const vec16 kAddYJ64 = { 64, 64, 64, 64, 64, 64, 64, 64 }; +static const lvec16 kAddYJ64_AVX = { + 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 +}; static const ulvec8 kAddY16_AVX = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, @@ -760,6 +768,48 @@ void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) { } #endif // HAS_ARGBTOYROW_AVX2 +#ifdef 
HAS_ARGBTOYJROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
+__declspec(naked) __declspec(align(32))
+void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb: read 128 bytes per iteration */
+    mov        edx, [esp + 8]   /* dst_y: written 32 bytes per iteration */
+    mov        ecx, [esp + 12]  /* pix: count, decremented by 32 per pass */
+    vmovdqa    ymm4, kARGBToYJ_AVX     // Per-pixel byte weights 15, 75, 38, 0.
+    vmovdqa    ymm5, kAddYJ64_AVX      // 64 per word = 0.5 before the >> 7.
+    vmovdqa    ymm6, kShufARGBToY_AVX  // Dword indices used by vpermd below.
+
+    align      16
+ convertloop:
+    vmovdqu    ymm0, [eax]       // 4 x 8 pixels; source may be unaligned.
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4  // Weighted byte pairs -> 2 words per pixel.
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1  // 1 word per pixel. mutates (per-lane order).
+    vphaddw    ymm2, ymm2, ymm3
+    vpaddw     ymm0, ymm0, ymm5  // Add .5 for rounding.
+    vpaddw     ymm2, ymm2, ymm5
+    vpsrlw     ymm0, ymm0, 7     // Drop the 7 fraction bits.
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2  // Words -> 32 bytes. mutates (per-lane order).
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    sub        ecx, 32           // Flags survive vmovdqu/lea and feed jg.
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYJROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {