From 4b4a32cb17596321ccee7ba3179bcd3ad6e2c81e Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Tue, 6 Nov 2012 01:56:52 +0000 Subject: [PATCH] ARGB1555 to ARGB Neon optimized BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/965007 git-svn-id: http://libyuv.googlecode.com/svn/trunk@472 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/convert_argb.cc | 40 ++++++++++++++++------- source/row_any.cc | 4 +++ source/row_neon.cc | 69 ++++++++++++++++++++++++++++++++++++++-- 5 files changed, 102 insertions(+), 15 deletions(-) diff --git a/README.chromium b/README.chromium index 0b383abbe..67e4b9391 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 471 +Version: 472 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d485c0f21..d11d47ed4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 471 +#define LIBYUV_VERSION 472 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 2c0d2dd4f..9866f44e6 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -534,7 +534,7 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, uint8* dst_argb, int dst_stride_argb, int width, int height) { if (!src_argb1555 || !dst_argb || - width <= 0 || height == 0) { + width <= 0 || height == 0) { return -1; } // Negative height means invert the image. @@ -543,13 +543,22 @@ int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555, src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555; src_stride_argb1555 = -src_stride_argb1555; } - void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, - int pix) = ARGB1555ToARGBRow_C; + void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb, int pix) = + ARGB1555ToARGBRow_C; #if defined(HAS_ARGB1555TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 8) && + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB1555TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON; + } } #endif @@ -576,13 +585,22 @@ int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444, src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444; src_stride_argb4444 = -src_stride_argb4444; } - void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, - int pix) = ARGB4444ToARGBRow_C; + void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb, int pix) = + ARGB4444ToARGBRow_C; #if defined(HAS_ARGB4444TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(width, 8) && + if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2; + } + } +#elif defined(HAS_ARGB4444TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON; + } } #endif diff --git a/source/row_any.cc b/source/row_any.cc index 4dea57ff7..45330e510 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -184,6 +184,8 @@ YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2, 1, 16) YANY(RGB24ToARGBRow_Any_SSSE3, RGB24ToARGBRow_SSSE3, 3, 4, 16) YANY(RAWToARGBRow_Any_SSSE3, RAWToARGBRow_SSSE3, 3, 4, 16) YANY(RGB565ToARGBRow_Any_SSE2, RGB565ToARGBRow_SSE2, 2, 4, 8) +YANY(ARGB1555ToARGBRow_Any_SSE2, ARGB1555ToARGBRow_SSE2, 2, 4, 8) +YANY(ARGB4444ToARGBRow_Any_SSE2, ARGB4444ToARGBRow_SSE2, 2, 4, 8) #endif #ifdef HAS_ARGBTOYROW_NEON YANY(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 4, 1, 8) @@ -198,6 +200,8 @@ YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16) YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8) YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8) YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8) +YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8) +YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8) #endif #undef YANY diff --git a/source/row_neon.cc b/source/row_neon.cc index 2c6643f18..8851d2991 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1113,7 +1113,6 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { #endif // HAS_RAWTOARGBROW_NEON #ifdef HAS_RGB565TOARGBROW_NEON - #define RGB565TOARGB \ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ "vshrn.u16 d5, q0, #5 \n" /* G xxGGGGGG */ \ @@ -1133,7 +1132,7 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vmov.u8 d7, #7 \n" // 5 bit mask ".p2align 2 \n" "1: \n" - "vld1.8 {q0}, [%0]! \n" // load 8 pixels of RGB565. + "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. @@ -1147,6 +1146,72 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { } #endif // HAS_RGB565TOARGBROW_NEON +#ifdef HAS_ARGB1555TOARGBROW_NEON +#define ARGB1555TOARGB \ + "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \ + "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \ + "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \ + "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \ + "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \ + "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \ + "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \ + "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \ + "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \ + "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \ + "vorr.u8 q1, q1, q3 \n" /* R,A */ \ + "vorr.u8 q0, q0, q2 \n" /* B,G */ \ + +void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB1555TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb1555), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGB1555TOARGBROW_NEON + +#ifdef HAS_ARGB4444TOARGBROW_NEON +#define ARGB4444TOARGB \ + "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \ + "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \ + "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \ + "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \ + "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \ + "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \ + "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \ + "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */ + +void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, + int pix) { + asm volatile ( + "vmov.u8 d3, #255 \n" // Alpha + ".p2align 2 \n" + "1: \n" + "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. + "subs %2, %2, #8 \n" // 8 processed per loop. + ARGB4444TOARGB + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb4444), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc", "q0", "q1", "q2", "q3" // Clobber List + ); +} +#endif // HAS_ARGB4444TOARGBROW_NEON + #ifdef HAS_ARGBTORGBAROW_NEON void ARGBToRGBARow_NEON(const uint8* src_argb, uint8* dst_rgba, int pix) { asm volatile (