From c74fe987257b082e0bb887290f97caf7ab3bad66 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 5 Oct 2012 12:00:07 +0000 Subject: [PATCH] YUY2 16 pixels at a time is 2x faster BUG=116 TEST=libyuv_unittest Review URL: https://webrtc-codereview.appspot.com/870005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@393 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/convert.cc | 30 ++++++++++-------------------- source/row_neon.cc | 21 +++++++++++---------- 4 files changed, 23 insertions(+), 32 deletions(-) diff --git a/README.chromium b/README.chromium index 9c9461532..c0c0b5560 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 392 +Version: 393 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d5ecb6ddb..f5b72527f 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 392 +#define LIBYUV_VERSION 393 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index cb3ad2a96..fac6c674a 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -558,11 +558,9 @@ int Q420ToI420(const uint8* src_y, int src_stride_y, YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; } } - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_NEON; - } + YUY2ToUV422Row = YUY2ToUV422Row_NEON; } } #endif @@ -683,11 +681,9 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, YUY2ToUVRow = YUY2ToUVRow_Any_NEON; } } - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { YUY2ToYRow = YUY2ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_NEON; - } + YUY2ToUVRow = YUY2ToUVRow_NEON; } } #endif @@ -752,11 +748,9 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy, UYVYToUVRow = UYVYToUVRow_Any_NEON; } } - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_NEON; - } + UYVYToUVRow = UYVYToUVRow_NEON; } } #endif @@ -872,11 +866,9 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210, UYVYToUVRow = UYVYToUVRow_Any_NEON; } } - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_NEON; - } + UYVYToUVRow = UYVYToUVRow_NEON; } } #endif @@ -903,11 +895,9 @@ int V210ToI420(const uint8* src_v210, int src_stride_v210, UYVYToUVRow = UYVYToUVRow_Any_NEON; } } - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { UYVYToYRow = UYVYToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - UYVYToUVRow = UYVYToUVRow_NEON; - } + UYVYToUVRow = UYVYToUVRow_NEON; } } #endif diff --git a/source/row_neon.cc b/source/row_neon.cc index 3a2a96a69..14455fd19 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -509,15 +509,15 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "1: \n" - "vld2.u8 {d0, d1}, [%0]! \n" // load 8 pixels of YUY2. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.u8 {d0}, [%1]! \n" // store 8 pixels of Y. + "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "memory", "cc", "d0", "d1" // Clobber List + : "memory", "cc", "q0", "q1" // Clobber List ); } #endif // HAS_YUY2TOYROW_NEON @@ -526,19 +526,22 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( "1: \n" - "vld2.u8 {d0, d1}, [%0]! \n" // load 8 pixels of UYVY. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst1.u8 {d1}, [%1]! \n" // store 8 pixels of Y. + "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 : - : "memory", "cc", "d0", "d1" // Clobber List + : "memory", "cc", "q0", "q1" // Clobber List ); } #endif // HAS_UYVYTOYROW_NEON +#endif // HAS_UYVYTOYROW_NEON + + #ifdef HAS_YUY2TOYROW_NEON void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, int pix) { @@ -627,8 +630,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, : "memory", "cc", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List ); } -#endif // HAS_UYVYTOYROW_NEON - #endif // __ARM_NEON__ #ifdef __cplusplus