From 64961c01b200a77b4af9629bf1215358ec056f0a Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 19 Sep 2012 20:03:20 +0000 Subject: [PATCH] ARGBToRGBA_NEON and ARGBToRGB24_NEON BUG=68 TEST=none Review URL: https://webrtc-codereview.appspot.com/816004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@367 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 7 +++++++ include/libyuv/version.h | 2 +- source/convert.cc | 18 ++++++++++++++++++ source/convert_from.cc | 11 +++++++++++ source/planar_functions.cc | 16 ++++++++++++++++ source/row_common.cc | 3 +++ source/row_neon.cc | 37 +++++++++++++++++++++++++++++++++++++ 8 files changed, 94 insertions(+), 2 deletions(-) diff --git a/README.chromium b/README.chromium index b7d3b99cf..2cdd54f36 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 366 +Version: 367 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ced0e3201..d2ac3d030 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -115,6 +115,8 @@ extern "C" { #define HAS_I422TOBGRAROW_NEON #define HAS_I422TOABGRROW_NEON #define HAS_I422TORGBAROW_NEON +#define HAS_ARGBTORGBAROW_NEON +#define HAS_ARGBTORGB24ROW_NEON #endif #if defined(_MSC_VER) && !defined(__CLR_VER) @@ -256,6 +258,9 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + void ARGBToRGBARow_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -472,6 +477,8 @@ void 
ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); + void ARGBToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void ABGRToYRow_Any_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d6e2cb42e..da61880cf 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 366 +#define LIBYUV_VERSION 367 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index b23afdade..1e24d0642 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -744,6 +744,24 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2, } } } +#endif +#if defined(HAS_YUY2TOYROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width > 16) { + YUY2ToUVRow = YUY2ToUVRow_Any_NEON; + YUY2ToYRow = YUY2ToYRow_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + YUY2ToUVRow = YUY2ToUVRow_Unaligned_NEON; + YUY2ToYRow = YUY2ToYRow_Unaligned_NEON; + if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { + YUY2ToUVRow = YUY2ToUVRow_NEON; + if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) { + YUY2ToYRow = YUY2ToYRow_NEON; + } + } + } + } #endif for (int y = 0; y < height - 1; y += 2) { YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width); diff --git a/source/convert_from.cc b/source/convert_from.cc index 0abbd6855..3ab7bc631 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -927,6 +927,17 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, } } #endif +#if 
defined(HAS_ARGBTORGB24ROW_NEON) + // TODO(fbarchard): One step I420ToRGB24Row_NEON. + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { I422ToARGBRow(src_y, src_u, src_v, row, width); diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 54ff4614b..e8146e571 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -320,6 +320,12 @@ int ARGBToRGBA(const uint8* src_argb, int src_stride_argb, ARGBToRGBARow = ARGBToRGBARow_SSSE3; } #endif +#if defined(HAS_ARGBTORGBAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && + IS_ALIGNED(width, 16)) { + ARGBToRGBARow = ARGBToRGBARow_NEON; + } +#endif for (int y = 0; y < height; ++y) { ARGBToRGBARow(src_argb, dst_rgba, width); @@ -355,6 +361,16 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB24ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; + } + if (IS_ALIGNED(width, 16)) { + ARGBToRGB24Row = ARGBToRGB24Row_NEON; + } + } +#endif for (int y = 0; y < height; ++y) { ARGBToRGB24Row(src_argb, dst_rgb24, width); diff --git a/source/row_common.cc b/source/row_common.cc index b84bc142d..337ecfd04 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -979,6 +979,9 @@ RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) #endif +#if defined(HAS_ARGBTORGB24ROW_NEON) +RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 3) +#endif #undef RGBANY #ifdef HAS_ARGBTOYROW_SSSE3 diff --git a/source/row_neon.cc b/source/row_neon.cc index a50cc2cf6..c6d4ed28a 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -18,6 +18,8 @@ extern "C" { // This module is for GCC Neon 
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__) +// TODO(fbarchard): Make a fetch macro so different subsamples can be done. +// TODO(fbarchard): Rework register usage to produce RGB in d21 - d23. #define YUV422TORGB \ "vld1.u8 {d0}, [%0]! \n" \ "vld1.u32 {d2[0]}, [%1]! \n" \
@@ -358,6 +360,41 @@ void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
 }
 #endif  // HAS_MIRRORROWUV_NEON
 
+#ifdef HAS_ARGBTORGBAROW_NEON
+void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) {
+  asm volatile (  // Per 4 byte pixel: source bytes 0,1,2,3 -> 3,0,1,2.
+  "1:                                          \n"  // Body runs at least once.
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vmov.u8    d0, d4                         \n"  // 4th channel goes first.
+    "vst4.8     {d0, d1, d2, d3}, [%1]!        \n"  // store 8 pixels of RGBA.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),  // %0  advanced 32 bytes per loop.
+    "+r"(dst_rgba),  // %1  advanced 32 bytes per loop.
+    "+r"(pix)        // %2  pixels left; effectively rounded up to 8.
+  :
+  : "memory", "cc", "d0", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGBAROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+  asm volatile (  // Per pixel: keep source bytes 0,1,2 and drop byte 3.
+  "1:                                          \n"  // Body runs at least once.
+    "vld4.8     {d1, d2, d3, d4}, [%0]!        \n"  // load 8 pixels of ARGB.
+    "subs       %2, %2, #8                     \n"  // 8 processed per loop.
+    "vst3.8     {d1, d2, d3}, [%1]!            \n"  // store 8 pixels of RGB24.
+    "bgt        1b                             \n"  // loop while pix > 0.
+  : "+r"(src_argb),   // %0  advanced 32 bytes per loop.
+    "+r"(dst_rgb24),  // %1  advanced 24 bytes per loop.
+    "+r"(pix)         // %2  pixels left; effectively rounded up to 8.
+  :
+  : "memory", "cc", "d1", "d2", "d3", "d4"  // Clobber List
+  );
+}
+#endif  // HAS_ARGBTORGB24ROW_NEON
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus