From 5808cb22ce60bf963e15bfb1a0958cb362f5efbc Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 20 Sep 2012 05:05:33 +0000 Subject: [PATCH] Neon optimized RAW (RGB big endian) BUG=68 TEST=unittest on try bot Review URL: https://webrtc-codereview.appspot.com/819005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@368 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 3 +++ include/libyuv/version.h | 2 +- source/convert.cc | 19 +------------------ source/convert_from.cc | 13 ++++++++++++- source/planar_functions.cc | 10 ++++++++++ source/row_common.cc | 1 + source/row_neon.cc | 18 ++++++++++++++++++ 8 files changed, 47 insertions(+), 21 deletions(-) diff --git a/README.chromium b/README.chromium index 2cdd54f36..4118fed76 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 367 +Version: 368 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d2ac3d030..f7fc3e768 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -117,6 +117,7 @@ extern "C" { #define HAS_I422TORGBAROW_NEON #define HAS_ARGBTORGBAROW_NEON #define HAS_ARGBTORGB24ROW_NEON +#define HAS_ARGBTORAWROW_NEON #endif #if defined(_MSC_VER) && !defined(__CLR_VER) @@ -260,6 +261,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -478,6 +480,7 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRAWRow_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index da61880cf..e8ce9f9e0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 367 +#define LIBYUV_VERSION 368 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 1e24d0642..2554b4ef8 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -745,24 +745,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } #endif -#if defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width > 16) { - YUY2ToUVRow = YUY2ToUVRow_Any_NEON; - YUY2ToYRow = YUY2ToYRow_Any_NEON; - } - if (IS_ALIGNED(width, 16)) { - YUY2ToUVRow = YUY2ToUVRow_Unaligned_NEON; - YUY2ToYRow = YUY2ToYRow_Unaligned_NEON; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUVRow = YUY2ToUVRow_NEON; - if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { - YUY2ToYRow = YUY2ToYRow_NEON; - } - } - } - } -#endif + for (int y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); YUY2ToYRow(src_yuy2, dst_y, width); diff --git a/source/convert_from.cc b/source/convert_from.cc index 3ab7bc631..b8b759cf0 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -882,6 +882,7 @@ int I420ToRGBA(const uint8* src_y, int src_stride_y, } // Convert I420 to RGB24. +// TODO(fbarchard): One step I420ToRGB24Row_NEON. int I420ToRGB24(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -928,7 +929,6 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } #endif #if defined(HAS_ARGBTORGB24ROW_NEON) - // TODO(fbarchard): One step I420ToRGB24Row_NEON. if (TestCpuFlag(kCpuHasNEON)) { if (width * 3 <= kMaxStride) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; @@ -953,6 +953,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } // Convert I420 to RAW. +// TODO(fbarchard): One step I420ToRAWRow_NEON. int I420ToRAW(const uint8* src_y, int src_stride_y, const uint8* src_u, int src_stride_u, const uint8* src_v, int src_stride_v, @@ -998,6 +999,16 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, } } #endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row, width); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index e8146e571..4aa618a07 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -406,6 +406,16 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORAWROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + ARGBToRAWRow = ARGBToRAWRow_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { ARGBToRAWRow(src_argb, dst_raw, width); diff --git a/source/row_common.cc b/source/row_common.cc index 337ecfd04..b3cf5a979 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -981,6 +981,7 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3) +RGBANY(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 3) #endif #undef RGBANY diff --git a/source/row_neon.cc b/source/row_neon.cc index c6d4ed28a..43ca6f825 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -395,6 +395,24 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { } #endif // HAS_ARGBTORGB24ROW_NEON +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { + asm volatile ( + "1: \n" + "vld4.u8 {q1,q2,q3,q4}, [%0]! \n" // load 16 pixels of ARGB. + "vswp.u8 q1, q3 \n" // swap R, B + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst3.u8 {q1,q2,q3}, [%1]! \n" // store 16 pixels of RAW. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_raw), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "q1", "q2", "q3", "q4" // Clobber List + ); +} +#endif // HAS_ARGBTORAWROW_NEON + #endif // __ARM_NEON__ #ifdef __cplusplus