From 00b69a2fe66183be5f72cb80c59f22e137b45359 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 2 Nov 2012 06:03:28 +0000 Subject: [PATCH] I400ToARGB_Neon optimized BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/935010 git-svn-id: http://libyuv.googlecode.com/svn/trunk@465 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 55 ++++++++++++-------------------------- include/libyuv/version.h | 2 +- source/convert_argb.cc | 20 ++++++++++---- source/row_any.cc | 5 ++++ source/row_neon.cc | 57 ++++++++++++++++++++++++++++++++++++++++ source/row_posix.cc | 31 ++++++++++++++++++++++ source/row_win.cc | 30 +++++++++++++++++++++ 8 files changed, 157 insertions(+), 45 deletions(-) diff --git a/README.chromium b/README.chromium index 96070ab5a..854e32785 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 464 +Version: 465 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index a7824ee16..4c2026583 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -144,6 +144,7 @@ extern "C" { #define HAS_ABGRTOARGBROW_NEON #define HAS_ARGBTOBAYERROW_NEON #define HAS_ARGBTORAWROW_NEON +#define HAS_I400TOARGBROW_NEON #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGBAROW_NEON #define HAS_BGRATOARGBROW_NEON @@ -450,31 +451,31 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_SSE2(const uint8* src_y, uint8* dst_argb, int pix); +void I400ToARGBRow_Any_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I444ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, int width); - void I422ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, int width); - void I411ToARGBRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void NV12ToARGBRow_C(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); - void NV21ToRGB565Row_C(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, @@ -483,24 +484,20 @@ void NV12ToRGB565Row_C(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); - void NV21ToARGBRow_C(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); - void I422ToBGRARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* bgra_buf, int width); - void I422ToABGRRow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* abgr_buf, int width); - void I422ToRGBARow_C(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -531,7 +528,6 @@ void I422ToRGB565Row_C(const uint8* y_buf, const uint8* v_buf, uint8* dst_rgb565, int width); - void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width); @@ -541,51 +537,42 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* argb_buf, int width); - void I422ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* argb_buf, int width); - void I411ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void NV12ToARGBRow_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); - void NV21ToARGBRow_SSSE3(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); - void NV12ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* uv_buf, uint8* argb_buf, int width); - void NV21ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* vu_buf, uint8* argb_buf, int width); - void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* bgra_buf, int width); - void I422ToABGRRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* abgr_buf, int width); - void I422ToRGBARow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -606,14 +593,12 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* rgb_buf, int width); - // RGB24/RAW are unaligned. void I422ToRGB24Row_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToRAWRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -719,20 +704,17 @@ void I422ToRGB565Row_Any_SSSE3(const uint8* y_buf, const uint8* v_buf, uint8* rgba_buf, int width); - // RGB24/RAW are unaligned. void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToRAWRow_Any_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void YToARGBRow_SSE2(const uint8* y_buf, uint8* argb_buf, int width); @@ -847,24 +829,21 @@ void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, const uint8* v_buf, uint8* rgb_buf, int width); - void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); - + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 74566f814..cd1e14ce0 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 464 +#define LIBYUV_VERSION 465 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 14ab96d70..cab63d8ff 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -248,13 +248,23 @@ int I400ToARGB(const uint8* src_y, int src_stride_y, void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) = I400ToARGBRow_C; #if defined(HAS_I400TOARGBROW_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) && - IS_ALIGNED(src_y, 8) && IS_ALIGNED(src_stride_y, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I400ToARGBRow = I400ToARGBRow_SSE2; + if (TestCpuFlag(kCpuHasSSE2) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_SSE2; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I400ToARGBRow = I400ToARGBRow_SSE2; + } + } + } +#elif defined(HAS_I400TOARGBROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 8) { + I400ToARGBRow = I400ToARGBRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + I400ToARGBRow = I400ToARGBRow_NEON; + } } #endif - for (int y = 0; y < height; ++y) { I400ToARGBRow(src_y, dst_argb, width); src_y += src_stride_y; diff --git a/source/row_any.cc b/source/row_any.cc index 8a06202f4..bc5ea964d 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -116,6 +116,7 @@ NV2NY(NV21ToRGB565Row_Any_NEON, NV21ToRGB565Row_NEON, NV21ToRGB565Row_C, 0, 2) // SSSE3 RGB24 is multiple of 16 pixels, aligned source and destination. // SSE2 RGB565 is multiple of 4 pixels, ARGB must be aligned to 16 bytes. // NEON RGB24 is multiple of 8 pixels, unaligned source and destination. +// I400 To ARGB does multiple of 8 pixels with SIMD and remainder with C. #define RGBANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, MASK, SBPP, BPP) \ void NAMEANY(const uint8* argb_buf, \ uint8* rgb_buf, \ @@ -136,6 +137,8 @@ RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, ARGBToARGB1555Row_C, 3, 4, 2) RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 3, 4, 2) +RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C, + 7, 1, 4) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) RGBANY(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, ARGBToRGB24Row_C, 7, 4, 3) @@ -146,6 +149,8 @@ RGBANY(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, ARGBToARGB1555Row_C, 7, 4, 2) RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C, 7, 4, 2) +RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C, + 7, 1, 4) #endif #undef RGBANY diff --git a/source/row_neon.cc b/source/row_neon.cc index 52783dcdb..f84d7ba47 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -24,6 +24,11 @@ extern "C" { "vld1.u32 {d2[0]}, [%1]! \n" \ "vld1.u32 {d2[1]}, [%2]! \n" +// Read 8 Y, and set 4 U and 4 V to 128 +#define READYUV400 \ + "vld1.u8 {d0}, [%0]! \n" \ + "vmov.u8 d2, #128 \n" + // Read 8 Y and 4 UV from NV12 #define READNV12 \ "vld1.u8 {d0}, [%0]! \n" \ @@ -411,6 +416,58 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, } #endif // HAS_I422TOARGB4444ROW_NEON +#ifdef HAS_YTOARGBROW_NEON +void YToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + "vld1.u8 {d24}, [%3] \n" + "vld1.u8 {d25}, [%4] \n" + "vmov.u8 d26, #128 \n" + "vmov.u16 q14, #74 \n" + "vmov.u16 q15, #16 \n" + ".p2align 2 \n" + "1: \n" + READYUV400 + YUV422TORGB + "subs %2, %2, #8 \n" + "vmov.u8 d23, #255 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kUVToRB), // %3 + "r"(&kUVToG) // %4 + : "cc", "memory", "q0", "q1", "q2", "q3", + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + ); +} +#endif // HAS_YTOARGBROW_NEON + +#ifdef HAS_I400TOARGBROW_NEON +void I400ToARGBRow_NEON(const uint8* src_y, + uint8* dst_argb, + int width) { + asm volatile ( + ".p2align 2 \n" + "vmov.u8 d23, #255 \n" + "1: \n" + "vld1.u8 {d20}, [%0]! \n" + "vmov d21, d20 \n" + "vmov d22, d20 \n" + "subs %2, %2, #8 \n" + "vst4.8 {d20, d21, d22, d23}, [%1]! \n" + "bgt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "d20", "d21", "d22", "d23" + ); +} +#endif // HAS_I400TOARGBROW_NEON + #ifdef HAS_NV12TOARGBROW_NEON void NV12ToARGBRow_NEON(const uint8* src_y, const uint8* src_uv, diff --git a/source/row_posix.cc b/source/row_posix.cc index 1078ed654..fa0c07ec6 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -171,6 +171,37 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { ); } +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "pslld $0x18,%%xmm5 \n" + ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm0,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm0 \n" + "punpckhwd %%xmm1,%%xmm1 \n" + "por %%xmm5,%%xmm0 \n" + "por %%xmm5,%%xmm1 \n" + "movdqu %%xmm0,(%1) \n" + "movdqu %%xmm1,0x10(%1) \n" + "lea 0x20(%1),%1 \n" + "sub $0x8,%2 \n" + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_argb), // %1 + "+r"(pix) // %2 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { asm volatile ( "movdqa %3,%%xmm5 \n" diff --git a/source/row_win.cc b/source/row_win.cc index f0001cf80..680e24935 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -131,6 +131,7 @@ static const uvec8 kShuffleMaskARGBToRAW_0 = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u }; +// Duplicates gray value 3 times and fills in alpha opaque. __declspec(naked) __declspec(align(16)) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { @@ -159,6 +160,35 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } } +__declspec(naked) __declspec(align(16)) +void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb, + int pix) { + __asm { + mov eax, [esp + 4] // src_y + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // pix + pcmpeqb xmm5, xmm5 // generate mask 0xff000000 + pslld xmm5, 24 + + align 16 + convertloop: + movq xmm0, qword ptr [eax] + lea eax, [eax + 8] + punpcklbw xmm0, xmm0 + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm0 + punpckhwd xmm1, xmm1 + por xmm0, xmm5 + por xmm1, xmm5 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + ret + } +} + __declspec(naked) __declspec(align(16)) void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { __asm {