From 16a96645b4987fddbcf726dea2fcf5dc87ca10e1 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com" <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Fri, 2 Mar 2012 22:38:09 +0000
Subject: [PATCH] splituv and mirroruv in row use 2 pixels at a time in C

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |   2 +-
 include/libyuv/version.h |   2 +-
 source/rotate.cc         |  87 +-----------------
 source/rotate_neon.cc    | 162 ---------------------------------
 source/row.h             |   6 ++
 source/row_common.cc     | 132 ++++++++++++++++-----------
 source/row_neon.cc       | 191 +++++++++++++++++++++++++++++++++++----
 source/row_posix.cc      |  36 +++++++-
 source/row_win.cc        |  36 +++++++-
 9 files changed, 330 insertions(+), 324 deletions(-)

diff --git a/README.chromium b/README.chromium
index 6280fad0d..d046e76c4 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 200
+Version: 201
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index dcf55aab9..1c2305867 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 200
+#define LIBYUV_VERSION 201
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
 
diff --git a/source/rotate.cc b/source/rotate.cc
index 4d186c06e..a10313614 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -23,12 +23,6 @@ extern "C" {
 
 #if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(YUV_DISABLE_ASM)
-// Note static const preferred, but gives internal compiler error on gcc 4.2
-// Shuffle table for reversing the bytes of UV channels.
-uvec8 kShuffleMirrorUV = {
-  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
-};
-
 #if defined(__APPLE__) && defined(__i386__)
 #define DECLARE_FUNCTION(name) \
     ".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
 
 static void TransposeWx8_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int w) {
-  int i;
-  for (i = 0; i < w; ++i) {
+  for (int i = 0; i < w; ++i) {
     dst[0] = src[0 * src_stride];
     dst[1] = src[1 * src_stride];
     dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
 static void TransposeWxH_C(const uint8* src, int src_stride,
                            uint8* dst, int dst_stride,
                            int width, int height) {
-  int i, j;
-  for (i = 0; i < width; ++i)
-    for (j = 0; j < height; ++j)
+  for (int i = 0; i < width; ++i)
+    for (int j = 0; j < height; ++j)
       dst[i * dst_stride + j] = src[j * src_stride + i];
 }
 
@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
                  width, height);
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
-#define HAS_MIRRORROW_UV_SSSE3
-__declspec(naked)
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
-                       int width) {
-  __asm {
-    push      edi
-    mov       eax, [esp + 4 + 4]   // src
-    mov       edx, [esp + 4 + 8]   // dst_a
-    mov       edi, [esp + 4 + 12]  // dst_b
-    mov       ecx, [esp + 4 + 16]  // width
-    movdqa    xmm1, kShuffleMirrorUV
-    lea       eax, [eax + ecx * 2 - 16]
-    sub       edi, edx
-
- convertloop:
-    movdqa    xmm0, [eax]
-    lea       eax, [eax - 16]
-    pshufb    xmm0, xmm1
-    sub       ecx, 8
-    movlpd    qword ptr [edx], xmm0
-    movhpd    qword ptr [edx + edi], xmm0
-    lea       edx, [edx + 8]
-    ja        convertloop
-
-    pop       edi
-    ret
-  }
-}
-
-#elif (defined(__i386__) || defined(__x86_64__)) && \
-    !defined(YUV_DISABLE_ASM)
-#define HAS_MIRRORROW_UV_SSSE3
-void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
-                       int width) {
-  intptr_t temp_width = static_cast<intptr_t>(width);
-  asm volatile (
-    "movdqa     %4,%%xmm1                    \n"
-    "lea        -16(%0,%3,2),%0              \n"
-    "sub        %1,%2                        \n"
-  "1:                                        \n"
-    "movdqa     (%0),%%xmm0                  \n"
-    "lea        -16(%0),%0                   \n"
-    "pshufb     %%xmm1,%%xmm0                \n"
-    "sub        $8,%3                        \n"
-    "movlpd     %%xmm0,(%1)                  \n"
-    "movhpd     %%xmm0,(%1,%2)               \n"
-    "lea        8(%1),%1                     \n"
-    "ja         1b                           \n"
-  : "+r"(src),        // %0
-    "+r"(dst_a),      // %1
-    "+r"(dst_b),      // %2
-    "+r"(temp_width)  // %3
-  : "m"(kShuffleMirrorUV)  // %4
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1"
-#endif
-  );
-}
-#endif
-
-static void MirrorRowUV_C(const uint8* src,
-                          uint8* dst_a, uint8* dst_b,
-                          int width) {
-  src += (width << 1) - 2;
-  for (int i = 0; i < width; ++i) {
-    dst_a[i] = src[0];
-    dst_b[i] = src[1];
-    src -= 2;
-  }
-}
-
 void RotateUV180(const uint8* src, int src_stride,
                  uint8* dst_a, int dst_stride_a,
                  uint8* dst_b, int dst_stride_b,
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 264e81e7e..0f01f02b2 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -19,94 +19,6 @@ extern "C" {
 
 #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
 
-void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
-  asm volatile (
-    // compute where to start writing destination
-    "add        %1, %2                       \n"
-
-    // work on segments that are multiples of 16
-    "lsrs       r3, %2, #4                   \n"
-
-    // the output is written in two block.  8 bytes followed
-    // by another 8.  reading is done sequentially, from left to
-    // right.  writing is done from right to left in block sizes
-    // %1, the destination pointer is incremented after writing
-    // the first of the two blocks.  need to subtract that 8 off
-    // along with 16 to get the next location.
- "mov r3, #-24 \n" - - "beq 2f \n" - - // back of destination by the size of the register that is - // going to be mirrord - "sub %1, #16 \n" - - // the loop needs to run on blocks of 16. what will be left - // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the - // loop will run one extra time. - "sub %2, #16 \n" - - "1: \n" - "vld1.8 {q0}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments. unable to mirror - // the bytes in the entire 128 bits in one go. - "vrev64.8 q0, q0 \n" - - // because of the inability to mirror the entire 128 bits - // mirror the writing out of the two 64 bit segments. - "vst1.8 {d1}, [%1]! \n" - "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 - - "subs %2, #16 \n" - "bge 1b \n" - - // add 16 back to the counter. if the result is 0 there is no - // residuals so jump past - "adds %2, #16 \n" - "beq 5f \n" - - "add %1, #16 \n" - - "2: \n" - - "mov r3, #-3 \n" - - "sub %1, #2 \n" - "subs %2, #2 \n" - // check for 16*n+1 scenarios where segments_of_2 should not - // be run, but there is something left over. - "blt 4f \n" - -// do this in neon registers as per -// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 - - "vst1.8 {d1[0]}, [%1]! \n" - "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 - - "subs %2, #2 \n" - "bge 3b \n" - - "adds %2, #2 \n" - "beq 5f \n" - - "4: \n" - "add %1, #1 \n" - "vld1.8 {d0[0]}, [%0] \n" - "vst1.8 {d0[0]}, [%1] \n" - - "5: \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : - : "memory", "cc", "r3", "q0" - ); -} - static const uvec8 vtbl_4x4_transpose = { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 }; @@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride, ); } -void MirrorRowUV_NEON(const uint8* src, - uint8* dst_a, uint8* dst_b, - int width) { - asm volatile ( - // compute where to start writing destination - "add %1, %3 \n" // dst_a + width - "add %2, %3 \n" // dst_b + width - - // work on input segments that are multiples of 16, but - // width that has been passed is output segments, half - // the size of input. - "lsrs r12, %3, #3 \n" - - "beq 2f \n" - - // the output is written in to two blocks. - "mov r12, #-8 \n" - - // back of destination by the size of the register that is - // going to be mirrord - "sub %1, #8 \n" - "sub %2, #8 \n" - - // the loop needs to run on blocks of 8. what will be left - // over is either a negative number, the residuals that need - // to be done, or 0. if this isn't subtracted off here the - // loop will run one extra time. - "sub %3, #8 \n" - - "1: \n" - "vld2.8 {d0, d1}, [%0]! \n" // src += 16 - - // mirror the bytes in the 64 bit segments - "vrev64.8 q0, q0 \n" - - "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 - "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 - - "subs %3, #8 \n" - "bge 1b \n" - - // add 8 back to the counter. if the result is 0 there is no - // residuals so return - "adds %3, #8 \n" - "beq 4f \n" - - "add %1, #8 \n" - "add %2, #8 \n" - - "2: \n" - - "mov r12, #-1 \n" - - "sub %1, #1 \n" - "sub %2, #1 \n" - - "3: \n" - "vld2.8 {d0[0], d1[0]}, [%0]! 
\n" // src += 2 - - "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 - "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 - - "subs %3, %3, #1 \n" - "bgt 3b \n" - "4: \n" - : "+r"(src), // %0 - "+r"(dst_a), // %1 - "+r"(dst_b), // %2 - "+r"(width) // %3 - : - : "memory", "cc", "r12", "q0" - ); -} - static const uvec8 vtbl_4x4_transpose_di = { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 }; diff --git a/source/row.h b/source/row.h index f90eb2473..f1da41e50 100644 --- a/source/row.h +++ b/source/row.h @@ -54,6 +54,7 @@ extern "C" { #define HAS_I444TOARGBROW_SSSE3 #define HAS_MIRRORROW_SSSE3 #define HAS_MIRRORROW_SSE2 +#define HAS_MIRRORROWUV_SSSE3 #define HAS_SPLITUV_SSE2 #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 @@ -66,6 +67,7 @@ extern "C" { // The following are available on Neon platforms #if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM) #define HAS_MIRRORROW_NEON +#define HAS_MIRRORROWUV_NEON #define HAS_SPLITUV_NEON #define HAS_COPYROW_NEON #define HAS_I420TOARGBROW_NEON @@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width); void MirrorRow_NEON(const uint8* src, uint8* dst, int width); void MirrorRow_C(const uint8* src, uint8* dst, int width); +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width); +void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width); + void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix); diff --git a/source/row_common.cc b/source/row_common.cc index 23352c8b1..30b1da6fd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -18,8 +18,8 @@ namespace libyuv { extern "C" { #endif -void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { +void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { // To support in-place conversion. uint8 r = src_abgr[0]; uint8 g = src_abgr[1]; @@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) { } } -void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) { - for (int x = 0; x < pix; ++x) { +void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) { + for (int x = 0; x < width; ++x) { // To support in-place conversion. 
     uint8 a = src_bgra[0];
     uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
   }
 }
 
-void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb24[0];
     uint8 g = src_rgb24[1];
     uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   }
 }
 
-void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 r = src_raw[0];
     uint8 g = src_raw[1];
     uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
   }
 }
 
-void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb[0] & 0x1f;
     uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
     uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_rgb[0] & 0x1f;
     uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
     uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 a = src_rgb[1] >> 4;
     uint8 r = src_rgb[1] & 0x0f;
     uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
   }
 }
 
-void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0];
     uint8 g = src_argb[1];
     uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
 }
 
 // TODO(fbarchard): support big endian CPU
-void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 3;
     uint8 g = src_argb[1] >> 2;
     uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 3;
     uint8 g = src_argb[1] >> 3;
     uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
   }
 }
 
-void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
-  for (int x = 0; x < pix; ++x) {
+void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
+  for (int x = 0; x < width; ++x) {
     uint8 b = src_argb[0] >> 4;
     uint8 g = src_argb[1] >> 4;
     uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
 MAKEROWY(BGRA,1,2,3)
 MAKEROWY(ABGR,0,1,2)
 
-void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
+void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
   // Copy a Y to RGB.
-  for (int x = 0; x < pix; ++x) {
+  for (int x = 0; x < width; ++x) {
     uint8 y = src_y[0];
     dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
     dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {
 
 void MirrorRow_C(const uint8* src, uint8* dst, int width) {
   src += width - 1;
-  for (int i = 0; i < width; ++i) {
-    dst[i] = src[0];
-    --src;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst[x] = src[0];
+    dst[x + 1] = src[-1];
+    src -= 2;
+  }
+  if (width & 1) {
+    dst[width - 1] = src[0];
   }
 }
 
-void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
-  // Copy a row of UV.
-  for (int x = 0; x < pix; ++x) {
-    dst_u[0] = src_uv[0];
-    dst_v[0] = src_uv[1];
-    src_uv += 2;
-    dst_u += 1;
-    dst_v += 1;
+void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  src_uv += (width - 1) << 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[-2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[-2 + 1];
+    src_uv -= 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
+  }
+}
+
+void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_u[x] = src_uv[0];
+    dst_u[x + 1] = src_uv[2];
+    dst_v[x] = src_uv[1];
+    dst_v[x + 1] = src_uv[3];
+    src_uv += 4;
+  }
+  if (width & 1) {
+    dst_u[width - 1] = src_uv[0];
+    dst_v[width - 1] = src_uv[1];
   }
 }
 
@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
 
 // Filter 2 rows of YUY2 UV's (422) into U and V (420)
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int pix) {
+                   uint8* dst_u, uint8* dst_v, int width) {
   // Output a row of UV values, filtering 2 rows of YUY2
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
     src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
   }
 }
 
-void YUY2ToYRow_C(const uint8* src_yuy2,
-                  uint8* dst_y, int pix) {
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   // Copy a row of yuy2 Y values
-  for (int x = 0; x < pix; ++x) {
-    dst_y[0] = src_yuy2[0];
-    src_yuy2 += 2;
-    dst_y += 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_yuy2[0];
+    dst_y[x + 1] = src_yuy2[2];
+    src_yuy2 += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[0];
   }
 }
 
 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int pix) {
+                   uint8* dst_u, uint8* dst_v, int width) {
   // Copy a row of uyvy UV values
-  for (int x = 0; x < pix; x += 2) {
+  for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
     src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
   }
 }
 
-void UYVYToYRow_C(const uint8* src_uyvy,
-                  uint8* dst_y, int pix) {
+void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   // Copy a row of uyvy Y values
-  for (int x = 0; x < pix; ++x) {
-    dst_y[0] = src_uyvy[1];
-    src_uyvy += 2;
-    dst_y += 1;
+  for (int x = 0; x < width - 1; x += 2) {
+    dst_y[x] = src_yuy2[1];
+    dst_y[x + 1] = src_yuy2[3];
+    src_yuy2 += 4;
+  }
+  if (width & 1) {
+    dst_y[width - 1] = src_yuy2[1];
   }
 }
 
diff --git a/source/row_neon.cc b/source/row_neon.cc
index afa98bdbf..bd88eae93 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -22,38 +22,26 @@ extern "C" {
     "vld1.u8    {d0}, [%0]!                  \n"                               \
     "vld1.u32   {d2[0]}, [%1]!               \n"                               \
     "vld1.u32   {d2[1]}, [%2]!               \n"                               \
-                                                                               \
     "veor.u8    d2, d26                      \n"/*subtract 128 from u and v*/ \
-                                                                               \
     "vmull.s8   q8, d2, d24                  \n"/* u/v B/R component */       \
-                                                                               \
     "vmull.s8   q9, d2, d25                  \n"/* u/v G component */         \
-                                                                               \
    "vmov.u8    d1, #0                       \n"/* split odd/even y apart */  \
     "vtrn.u8    d0, d1                       \n"                               \
-                                                                               \
     "vsub.s16   q0, q0, q15                  \n"/* offset y */                \
     "vmul.s16   q0, q0, q14                  \n"                               \
-                                                                               \
     "vadd.s16   d18, d19                     \n"                               \
-                                                                               \
     "vqadd.s16  d20, d0, d16                 \n"                               \
     "vqadd.s16  d21, d1, d16                 \n"                               \
-                                                                               \
     "vqadd.s16  d22, d0, d17                 \n"                               \
     "vqadd.s16  d23, d1, d17                 \n"                               \
-                                                                               \
     "vqadd.s16  d16, d0, d18                 \n"                               \
     "vqadd.s16  d17, d1, d18                 \n"                               \
-                                                                               \
     "vqrshrun.s16 d0, q10, #6                \n"                               \
     "vqrshrun.s16 d1, q11, #6                \n"                               \
     "vqrshrun.s16 d2, q8, #6                 \n"                               \
-                                                                               \
     "vmovl.u8   q10, d0                      \n"/* set up for reinterleave*/  \
     "vmovl.u8   q11, d1                      \n"                               \
     "vmovl.u8   q8, d2                       \n"                               \
-                                                                               \
     "vtrn.u8    d20, d21                     \n"                               \
     "vtrn.u8    d22, d23                     \n"                               \
     "vtrn.u8    d16, d17                     \n"                               \
@@ -67,7 +55,7 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
                              0, 0, 0, 0, 0, 0, 0, 0 };
 #endif
 
-#if defined(HAS_I420TOARGBROW_NEON)
+#ifdef HAS_I420TOARGBROW_NEON
 void I420ToARGBRow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -99,7 +87,7 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_I420TOBGRAROW_NEON)
+#ifdef HAS_I420TOBGRAROW_NEON
 void I420ToBGRARow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -132,7 +120,7 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_I420TOABGRROW_NEON)
+#ifdef HAS_I420TOABGRROW_NEON
 void I420ToABGRRow_NEON(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
@@ -165,10 +153,10 @@ YUVTORGB
 }
 #endif
 
-#if defined(HAS_SPLITUV_NEON)
+#ifdef HAS_SPLITUV_NEON
 // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
 // Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
-void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
+void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
   asm volatile (
   "1:                                        \n"
     "vld2.u8    {q0,q1}, [%0]!               \n"  // load 16 pairs of UV
\n" // load 16 pairs of UV @@ -179,15 +167,14 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(pix) // %3 // Output registers + "+r"(width) // %3 // Output registers : // Input registers : "memory", "cc", "q0", "q1" // Clobber List ); } #endif -#if defined(HAS_COPYROW_NEON) -// TODO(fbarchard): Test without pld on NexusS +#ifdef HAS_COPYROW_NEON // Copy multiple of 64 void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( @@ -206,6 +193,170 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { } #endif // HAS_COPYROW_NEON +#ifdef HAS_MIRRORROW_NEON +void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %2 \n" + + // work on segments that are multiples of 16 + "lsrs r3, %2, #4 \n" + + // the output is written in two block. 8 bytes followed + // by another 8. reading is done sequentially, from left to + // right. writing is done from right to left in block sizes + // %1, the destination pointer is incremented after writing + // the first of the two blocks. need to subtract that 8 off + // along with 16 to get the next location. + "mov r3, #-24 \n" + + "beq 2f \n" + + // back of destination by the size of the register that is + // going to be mirrord + "sub %1, #16 \n" + + // the loop needs to run on blocks of 16. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %2, #16 \n" + + "1: \n" + "vld1.8 {q0}, [%0]! \n" // src += 16 + + // mirror the bytes in the 64 bit segments. unable to mirror + // the bytes in the entire 128 bits in one go. + "vrev64.8 q0, q0 \n" + + // because of the inability to mirror the entire 128 bits + // mirror the writing out of the two 64 bit segments. + "vst1.8 {d1}, [%1]! \n" + "vst1.8 {d0}, [%1], r3 \n" // dst -= 16 + + "subs %2, #16 \n" + "bge 1b \n" + + // add 16 back to the counter. if the result is 0 there is no + // residuals so jump past + "adds %2, #16 \n" + "beq 5f \n" + + "add %1, #16 \n" + + "2: \n" + + "mov r3, #-3 \n" + + "sub %1, #2 \n" + "subs %2, #2 \n" + // check for 16*n+1 scenarios where segments_of_2 should not + // be run, but there is something left over. + "blt 4f \n" + +// do this in neon registers as per +// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/ + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 + + "vst1.8 {d1[0]}, [%1]! \n" + "vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2 + + "subs %2, #2 \n" + "bge 3b \n" + + "adds %2, #2 \n" + "beq 5f \n" + + "4: \n" + "add %1, #1 \n" + "vld1.8 {d0[0]}, [%0] \n" + "vst1.8 {d0[0]}, [%1] \n" + + "5: \n" + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : + : "memory", "cc", "r3", "q0" + ); +} +#endif + +#ifdef HAS_MIRRORROWUV_NEON +void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) { + asm volatile ( + // compute where to start writing destination + "add %1, %3 \n" // dst_a + width + "add %2, %3 \n" // dst_b + width + + // work on input segments that are multiples of 16, but + // width that has been passed is output segments, half + // the size of input. + "lsrs r12, %3, #3 \n" + + "beq 2f \n" + + // the output is written in to two blocks. 
+ "mov r12, #-8 \n" + + // back of destination by the size of the register that is + // going to be mirrord + "sub %1, #8 \n" + "sub %2, #8 \n" + + // the loop needs to run on blocks of 8. what will be left + // over is either a negative number, the residuals that need + // to be done, or 0. if this isn't subtracted off here the + // loop will run one extra time. + "sub %3, #8 \n" + + "1: \n" + "vld2.8 {d0, d1}, [%0]! \n" // src += 16 + + // mirror the bytes in the 64 bit segments + "vrev64.8 q0, q0 \n" + + "vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8 + "vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8 + + "subs %3, #8 \n" + "bge 1b \n" + + // add 8 back to the counter. if the result is 0 there is no + // residuals so return + "adds %3, #8 \n" + "beq 4f \n" + + "add %1, #8 \n" + "add %2, #8 \n" + + "2: \n" + + "mov r12, #-1 \n" + + "sub %1, #1 \n" + "sub %2, #1 \n" + + "3: \n" + "vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2 + + "vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1 + "vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1 + + "subs %3, %3, #1 \n" + "bgt 3b \n" + "4: \n" + : "+r"(src), // %0 + "+r"(dst_a), // %1 + "+r"(dst_b), // %2 + "+r"(width) // %3 + : + : "memory", "cc", "r12", "q0" + ); +} +#endif + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index de9a954d7..ee2e77968 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf, #endif #ifdef HAS_MIRRORROW_SSSE3 - // Shuffle table for reversing the bytes. CONST uvec8 kShuffleMirror = { 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u @@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif #ifdef HAS_MIRRORROW_SSE2 - void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { intptr_t temp_width = static_cast(width); asm volatile ( @@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { } #endif +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. +CONST uvec8 kShuffleMirrorUV = { + 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u +}; +void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, + int width) { + intptr_t temp_width = static_cast(width); + asm volatile ( + "movdqa %4,%%xmm1 \n" + "lea -16(%0,%3,2),%0 \n" + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "lea -16(%0),%0 \n" + "pshufb %%xmm1,%%xmm0 \n" + "sub $8,%3 \n" + "movlpd %%xmm0,(%1) \n" + "movhpd %%xmm0,(%1,%2) \n" + "lea 8(%1),%1 \n" + "ja 1b \n" + : "+r"(src), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(temp_width) // %3 + : "m"(kShuffleMirrorUV) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} +#endif + #ifdef HAS_SPLITUV_SSE2 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( diff --git a/source/row_win.cc b/source/row_win.cc index d6169a306..8b008e830 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1501,7 +1501,6 @@ __asm { #endif #ifdef HAS_MIRRORROW_SSE2 - // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 // version can not. __declspec(naked) @@ -1529,6 +1528,41 @@ __asm { } #endif +#ifdef HAS_MIRRORROW_UV_SSSE3 +// Shuffle table for reversing the bytes of UV channels. 
+static const uvec8 kShuffleMirrorUV = {
+  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
+};
+
+__declspec(naked)
+void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
+                       int width) {
+  __asm {
+    push      edi
+    mov       eax, [esp + 4 + 4]   // src
+    mov       edx, [esp + 4 + 8]   // dst_u
+    mov       edi, [esp + 4 + 12]  // dst_v
+    mov       ecx, [esp + 4 + 16]  // width
+    movdqa    xmm1, kShuffleMirrorUV
+    lea       eax, [eax + ecx * 2 - 16]
+    sub       edi, edx
+
+ convertloop:
+    movdqa    xmm0, [eax]
+    lea       eax, [eax - 16]
+    pshufb    xmm0, xmm1
+    sub       ecx, 8
+    movlpd    qword ptr [edx], xmm0
+    movhpd    qword ptr [edx + edi], xmm0
+    lea       edx, [edx + 8]
+    ja        convertloop
+
+    pop       edi
+    ret
+  }
+}
+#endif
+
 #ifdef HAS_SPLITUV_SSE2
 __declspec(naked)
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
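
Note: every C rewrite in this change follows the same two-pixels-per-iteration shape. The loop body consumes pixel pairs (for UV rows, one 4-byte block holding two interleaved pairs), and an "if (width & 1)" tail copies the final pixel when the width is odd, so the fast path never reads past the end of the row. The sketch below is a minimal standalone illustration of that pattern; split_uv_pair and the test harness are hypothetical names for this example, not code from libyuv.

    #include <stdio.h>

    typedef unsigned char uint8;

    // Two pixels per iteration, in the style of the new SplitUV_C: each pass
    // consumes 4 bytes (two interleaved UV pairs), then the tail handles the
    // leftover pixel of an odd-width row.
    static void split_uv_pair(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                              int width) {
      int x;
      for (x = 0; x < width - 1; x += 2) {
        dst_u[x] = src_uv[0];
        dst_u[x + 1] = src_uv[2];
        dst_v[x] = src_uv[1];
        dst_v[x + 1] = src_uv[3];
        src_uv += 4;
      }
      if (width & 1) {  // odd width: copy the last UV pair outside the loop
        dst_u[width - 1] = src_uv[0];
        dst_v[width - 1] = src_uv[1];
      }
    }

    int main(void) {
      const uint8 uv[10] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};  // 5 UV pairs
      uint8 u[5], v[5];
      int i;
      split_uv_pair(uv, u, v, 5);  // odd width exercises the tail path
      for (i = 0; i < 5; ++i)
        printf("%u/%u ", u[i], v[i]);
      printf("\n");  // prints: 1/2 3/4 5/6 7/8 9/10
      return 0;
    }

The same structure appears in MirrorRow_C, MirrorRowUV_C, YUY2ToYRow_C, and UYVYToYRow_C above; only the source stride and direction differ.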