From 793e5a06ffe55a911f8aa3f4731ae681039952bc Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Sat, 3 Nov 2012 15:12:48 +0000 Subject: [PATCH] YUY2ToARGB_NEON in one step BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/964010 git-svn-id: http://libyuv.googlecode.com/svn/trunk@468 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 40 +++++++++++++ include/libyuv/version.h | 2 +- source/convert_argb.cc | 126 +++++++-------------------------------- source/row_any.cc | 10 ++++ source/row_common.cc | 73 +++++++++++++++++++++++ source/row_neon.cc | 77 +++++++++++++++++++++++- 7 files changed, 220 insertions(+), 110 deletions(-) diff --git a/README.chromium b/README.chromium index 14c4582c8..f609f4de7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 466 +Version: 468 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 7e0c42c48..cb574ece0 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -92,6 +92,8 @@ extern "C" { #define HAS_I422TOARGB4444ROW_SSSE3 #define HAS_I422TOARGB1555ROW_SSSE3 #define HAS_I422TORGB565ROW_SSSE3 +#define HAS_YUY2TOARGBROW_SSSE3 +#define HAS_UYVYTOARGBROW_SSSE3 // Effects #define HAS_ARGBAFFINEROW_SSE2 @@ -163,6 +165,8 @@ extern "C" { #define HAS_MIRRORROWUV_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV21TOARGBROW_NEON +#define HAS_YUY2TOARGBROW_NEON +#define HAS_UYVYTOARGBROW_NEON #define HAS_NV12TORGB565ROW_NEON #define HAS_NV21TORGB565ROW_NEON #define HAS_RAWTOARGBROW_NEON @@ -304,6 +308,12 @@ void NV21ToRGB565Row_NEON(const uint8* y_buf, const uint8* uv_buf, uint8* rgb_buf, int width); +void YUY2ToARGBRow_NEON(const uint8* yuy2_buf, + uint8* rgb_buf, + int width); +void UYVYToARGBRow_NEON(const uint8* uyvy_buf, + uint8* rgb_buf, + int width); void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix); @@ -501,6 +511,12 @@ void NV21ToARGBRow_C(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); +void YUY2ToARGBRow_C(const uint8* yuy2_buf, + uint8* argb_buf, + int width); +void UYVYToARGBRow_C(const uint8* uyvy_buf, + uint8* argb_buf, + int width); void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -576,6 +592,12 @@ void NV21ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); +void YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, + uint8* argb_buf, + int width); +void UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, + uint8* argb_buf, + int width); void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -641,6 +663,12 @@ void NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* yuy2_buf, + uint8* argb_buf, + int width); +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* uyvy_buf, + uint8* argb_buf, + int width); void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -687,6 +715,12 @@ void NV21ToRGB565Row_Any_SSSE3(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); +void YUY2ToARGBRow_Any_SSSE3(const uint8* yuy2_buf, + uint8* argb_buf, + int width); +void UYVYToARGBRow_Any_SSSE3(const uint8* uyvy_buf, + uint8* argb_buf, + int width); void I422ToBGRARow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -840,6 +874,12 @@ void NV21ToRGB565Row_Any_NEON(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); +void YUY2ToARGBRow_Any_NEON(const uint8* yuy2_buf, + uint8* argb_buf, + int width); +void UYVYToARGBRow_Any_NEON(const uint8* uyvy_buf, + uint8* argb_buf, + int width); void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 95d51bb2a..6d1481261 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 466 +#define LIBYUV_VERSION 468 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 6f4c5ea10..636693688 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -740,79 +740,28 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2; src_stride_yuy2 = -src_stride_yuy2; } - void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int pix) = YUY2ToUV422Row_C; - void (*YUY2ToYRow)(const uint8* src_yuy2, - uint8* dst_y, int pix) = YUY2ToYRow_C; -#if defined(HAS_YUY2TOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - if (width > 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2; - YUY2ToYRow = YUY2ToYRow_Any_SSE2; - } - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2; - YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_SSE2; - YUY2ToYRow = YUY2ToYRow_SSE2; - } - } - } -#elif defined(HAS_YUY2TOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - if (width > 8) { - YUY2ToYRow = YUY2ToYRow_Any_NEON; - if (width > 16) { - YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON; - } - } - if (IS_ALIGNED(width, 8)) { - YUY2ToYRow = YUY2ToYRow_NEON; - if (IS_ALIGNED(width, 16)) { - YUY2ToUV422Row = YUY2ToUV422Row_NEON; - } - } - } -#endif - - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) + void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) = + YUY2ToARGBRow_C; +#if defined(HAS_YUY2TOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + YUY2ToARGBRow = YUY2ToARGBRow_SSSE3; } } } -#elif defined(HAS_I422TOARGBROW_NEON) +#elif defined(HAS_YUY2TOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; + YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; + YUY2ToARGBRow = YUY2ToARGBRow_NEON; } } -#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; - } #endif - - SIMD_ALIGNED(uint8 rowy[kMaxStride]); - SIMD_ALIGNED(uint8 rowu[kMaxStride]); - SIMD_ALIGNED(uint8 rowv[kMaxStride]); - for (int y = 0; y < height; ++y) { - YUY2ToUV422Row(src_yuy2, rowu, rowv, width); - YUY2ToYRow(src_yuy2, rowy, width); - I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); + YUY2ToARGBRow(src_yuy2, dst_argb, width); src_yuy2 += src_stride_yuy2; dst_argb += dst_stride_argb; } @@ -834,63 +783,28 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy; src_stride_uyvy = -src_stride_uyvy; } - void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int pix) = UYVYToUV422Row_C; - void (*UYVYToYRow)(const uint8* src_uyvy, - uint8* dst_y, int pix) = UYVYToYRow_C; -#if defined(HAS_UYVYTOYROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2)) { - if (width > 16) { - UYVYToUV422Row = UYVYToUV422Row_Any_SSE2; - UYVYToYRow = UYVYToYRow_Any_SSE2; - } - if (IS_ALIGNED(width, 16)) { - UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2; - UYVYToYRow = UYVYToYRow_Unaligned_SSE2; - if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) { - UYVYToUV422Row = UYVYToUV422Row_SSE2; - UYVYToYRow = UYVYToYRow_SSE2; - } - } - } -#endif - void (*I422ToARGBRow)(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width) = I422ToARGBRow_C; -#if defined(HAS_I422TOARGBROW_SSSE3) + void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) = + UYVYToARGBRow_C; +#if defined(HAS_UYVYTOARGBROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; + UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3; + UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3; if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; + UYVYToARGBRow = UYVYToARGBRow_SSSE3; } } } -#elif defined(HAS_I422TOARGBROW_NEON) +#elif defined(HAS_UYVYTOARGBROW_NEON) if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; + UYVYToARGBRow = UYVYToARGBRow_Any_NEON; if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; + UYVYToARGBRow = UYVYToARGBRow_NEON; } } -#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && - IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { - I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; - } #endif - - SIMD_ALIGNED(uint8 rowy[kMaxStride]); - SIMD_ALIGNED(uint8 rowu[kMaxStride]); - SIMD_ALIGNED(uint8 rowv[kMaxStride]); - for (int y = 0; y < height; ++y) { - UYVYToUV422Row(src_uyvy, rowu, rowv, width); - UYVYToYRow(src_uyvy, rowy, width); - I422ToARGBRow(rowy, rowu, rowv, dst_argb, width); + UYVYToARGBRow(src_uyvy, dst_argb, width); src_uyvy += src_stride_uyvy; dst_argb += dst_stride_argb; } diff --git a/source/row_any.cc b/source/row_any.cc index 3ece77b33..1efc5572f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -114,6 +114,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) #endif // HAS_NV12TORGB565ROW_NEON #undef NVANY +// YUY2 to RGB does 8 at a time. // RGB to RGB does multiple of 16 pixels with SIMD and remainder with C. // SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination. // SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes. @@ -141,6 +142,10 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C, + 7, 2, 4) +RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C, + 7, 2, 4) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) @@ -153,10 +158,15 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, 7, 4, 2) RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, 7, 1, 4) +RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C, + 7, 2, 4) +RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, + 7, 2, 4) #endif #undef RGBANY // RGB/YUV to Y does multiple of 16 with SIMD and last 16 with SIMD. +// TODO(fbarchard): Use last 16 method for all unsubsampled conversions. #define YANY(NAMEANY, ARGBTOY_SIMD, BPP, NUM) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ ARGBTOY_SIMD(src_argb, dst_y, width - NUM); \ diff --git a/source/row_common.cc b/source/row_common.cc index b438409a2..5ab935a89 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -801,6 +801,34 @@ void NV21ToRGB565Row_C(const uint8* y_buf, } } +void YUY2ToARGBRow_C(const uint8* yuy2_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(yuy2_buf[0], yuy2_buf[1], yuy2_buf[3], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(yuy2_buf[2], yuy2_buf[1], yuy2_buf[3], rgb_buf + 4, 24, 16, 8, 0); + yuy2_buf += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(yuy2_buf[0], yuy2_buf[1], yuy2_buf[3], rgb_buf + 0, 24, 16, 8, 0); + } +} + +void UYVYToARGBRow_C(const uint8* uyvy_buf, + uint8* rgb_buf, + int width) { + for (int x = 0; x < width - 1; x += 2) { + YuvPixel(uyvy_buf[1], uyvy_buf[0], uyvy_buf[2], rgb_buf + 0, 24, 16, 8, 0); + YuvPixel(uyvy_buf[3], uyvy_buf[0], uyvy_buf[2], rgb_buf + 4, 24, 16, 8, 0); + uyvy_buf += 4; + rgb_buf += 8; // Advance 2 pixels. + } + if (width & 1) { + YuvPixel(uyvy_buf[1], uyvy_buf[0], uyvy_buf[2], rgb_buf + 0, 24, 16, 8, 0); + } +} + void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1402,6 +1430,7 @@ void I422ToARGB4444Row_SSSE3(const uint8* y_buf, I422ToARGBRow_SSSE3(y_buf, u_buf, v_buf, row, width); ARGBToARGB4444Row_SSE2(row, rgb_buf, width); } + void NV12ToRGB565Row_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, @@ -1420,6 +1449,50 @@ void NV21ToRGB565Row_SSSE3(const uint8* src_y, ARGBToRGB565Row_SSE2(row, dst_rgb565, width); } +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + YUY2ToUV422Row_SSE2(src_yuy2, rowu, rowv, width); + YUY2ToYRow_SSE2(src_yuy2, rowy, width); + I422ToARGBRow_SSSE3(rowy, rowu, rowv, dst_argb, width); +} + +void YUY2ToARGBRow_Unaligned_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + YUY2ToUV422Row_Unaligned_SSE2(src_yuy2, rowu, rowv, width); + YUY2ToYRow_Unaligned_SSE2(src_yuy2, rowy, width); + I422ToARGBRow_Unaligned_SSSE3(rowy, rowu, rowv, dst_argb, width); +} + +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + UYVYToUV422Row_SSE2(src_uyvy, rowu, rowv, width); + UYVYToYRow_SSE2(src_uyvy, rowy, width); + I422ToARGBRow_SSSE3(rowy, rowu, rowv, dst_argb, width); +} + +void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + SIMD_ALIGNED(uint8 rowy[kMaxStride]); + SIMD_ALIGNED(uint8 rowu[kMaxStride]); + SIMD_ALIGNED(uint8 rowv[kMaxStride]); + UYVYToUV422Row_Unaligned_SSE2(src_uyvy, rowu, rowv, width); + UYVYToYRow_Unaligned_SSE2(src_uyvy, rowy, width); + I422ToARGBRow_Unaligned_SSSE3(rowy, rowu, rowv, dst_argb, width); +} + #endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) #endif // !defined(YUV_DISABLE_ASM) diff --git a/source/row_neon.cc b/source/row_neon.cc index 89072f83e..0014e5df6 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -51,7 +51,7 @@ extern "C" { "vld1.u8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ - "vtrn.u32 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ @@ -59,7 +59,22 @@ extern "C" { "vld1.u8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ - "vtrn.u32 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 YUY2 +#define READYUY2 \ + "vld2.u8 {d0, d2}, [%0]! \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" + +// Read 8 UYVY +#define READUYVY \ + "vld2.u8 {d2, d3}, [%0]! \n" \ + "vmov.u8 d0, d3 \n" \ + "vmov.u8 d3, d2 \n" \ + "vuzp.u8 d2, d3 \n" \ + "vtrn.u32 d2, d3 \n" #define YUV422TORGB \ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\ @@ -674,6 +689,64 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, } #endif // HAS_NV21TORGB565ROW_NEON +#ifdef HAS_YUY2TOARGBROW_NEON +void YUY2ToARGBRow_NEON(const uint8* src_yuy2, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%3] \n" + "vld1.u8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUY2 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YUY2TOARGBROW_NEON + +#ifdef HAS_UYVYTOARGBROW_NEON +void UYVYToARGBRow_NEON(const uint8* src_uyvy, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%3] \n" + "vld1.u8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READUYVY + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_UYVYTOARGBROW_NEON + #ifdef HAS_SPLITUV_NEON // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.