From 067892c5a1010413167919d24e21fb9027acd66f Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Mon, 3 Nov 2014 18:30:17 +0000
Subject: [PATCH] Port YUY2ToYRow_AVX2 and UYVYToYRow_AVX2 to gcc/NaCL from Windows AVX code.

BUG=269
TESTED=ncval
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/25039004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1151 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/version.h |  2 +-
 source/row_any.cc        |  6 ++++-
 source/row_posix.cc      | 79 ++++++++++++++++++++++++++++++++++++++++
 4 files changed, 86 insertions(+), 3 deletions(-)

diff --git a/README.chromium b/README.chromium
index 9a6a9a132..100fe9677 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1148
+Version: 1149
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 6a7f50a7d..1e86cc158 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1148
+#define LIBYUV_VERSION 1149
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/row_any.cc b/source/row_any.cc
index b283157b1..25c271964 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -243,9 +243,13 @@ BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
 #ifdef HAS_ARGBTOYROW_AVX2
 YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_AVX2, ARGBToYRow_C, 4, 1, 31)
 YANY(ARGBToYJRow_Any_AVX2, ARGBToYJRow_AVX2, ARGBToYJRow_C, 4, 1, 31)
-YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_AVX2
 YANY(UYVYToYRow_Any_AVX2, UYVYToYRow_AVX2, UYVYToYRow_C, 2, 1, 31)
 #endif
+#ifdef HAS_YUY2TOYROW_AVX2
+YANY(YUY2ToYRow_Any_AVX2, YUY2ToYRow_AVX2, YUY2ToYRow_C, 2, 1, 31)
+#endif
 #ifdef HAS_ARGBTOYROW_SSSE3
 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_SSSE3, ARGBToYRow_C, 4, 1, 15)
 #endif
diff --git a/source/row_posix.cc b/source/row_posix.cc
index f17faec6a..c97bbe9eb 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2853,6 +2853,85 @@ void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
 }
 #endif  // HAS_YUY2TOYROW_SSE2
 
+#ifdef HAS_YUY2TOYROW_AVX2
+// Extracts the Y bytes from a row of packed YUY2 into dst_y.
+// The low byte of every 16-bit pair is kept (Y sits at even offsets in YUY2).
+// Each iteration reads 64 bytes from src_yuy2, writes 32 bytes to dst_y and
+// subtracts 32 from pix; it repeats while pix stays positive, so the body
+// always runs at least once.
+void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) {
+  asm volatile (
+    // ymm5 = 0x00ff in every 16-bit lane: keeps the low byte of each pair.
+    "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
+    "vpsrlw $0x8,%%ymm5,%%ymm5 \n"
+    LABELALIGN
+  "1: \n"
+    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+    "lea " MEMLEA(0x40,0) ",%0 \n"
+    "vpand %%ymm5,%%ymm0,%%ymm0 \n"
+    "vpand %%ymm5,%%ymm1,%%ymm1 \n"
+    // vpackuswb packs within each 128-bit lane, leaving the quadwords
+    // interleaved; vpermq $0xd8 (order 0,2,1,3) restores source order.
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+    // jg tests the flags from sub; vmovdqu and lea leave them untouched.
+    "sub $0x20,%2 \n"
+    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+    "lea " MEMLEA(0x20,1) ",%1 \n"
+    "jg 1b \n"
+    // Clear the upper ymm halves to avoid AVX-to-SSE transition penalties.
+    "vzeroupper \n"
+  : "+r"(src_yuy2),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}
+
+// Extracts the Y bytes from a row of packed UYVY into dst_y.
+// The high byte of every 16-bit pair is kept (Y sits at odd offsets in UYVY).
+// Each iteration reads 64 bytes from src_uyvy, writes 32 bytes to dst_y and
+// subtracts 32 from pix; the body always runs at least once.
+void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int pix) {
+  asm volatile (
+    LABELALIGN
+  "1: \n"
+    "vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
+    "vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
+    "lea " MEMLEA(0x40,0) ",%0 \n"
+    // Shift every 16-bit pair right by 8 so only its high byte remains.
+    "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
+    "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
+    // vpackuswb packs within each 128-bit lane, leaving the quadwords
+    // interleaved; vpermq $0xd8 (order 0,2,1,3) restores source order.
+    "vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
+    "vpermq $0xd8,%%ymm0,%%ymm0 \n"
+    // jg tests the flags from sub; vmovdqu and lea leave them untouched.
+    "sub $0x20,%2 \n"
+    "vmovdqu %%ymm0," MEMACCESS(1) " \n"
+    "lea " MEMLEA(0x20,1) ",%1 \n"
+    "jg 1b \n"
+    // Clear the upper ymm halves to avoid AVX-to-SSE transition penalties.
+    "vzeroupper \n"
+    // No "ret" here: this is not a naked function, so the compiler emits the
+    // epilogue and the return after the asm statement.
+  : "+r"(src_uyvy),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif  // HAS_YUY2TOYROW_AVX2
+
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time.
 void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,