From 373cdbdc58d6e7b7e4653840677ef01468607e84 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 14 Dec 2011 21:10:07 +0000
Subject: [PATCH] Reorder stores for FastConvertYUVToABGRRow_SSSE3 and
 FastConvertYUVToBGRARow_SSSE3, and reorder stores in rotate (for Core 2).
 Add ReverseRow_SSE2. Allow environment variable overrides in CPU
 detection: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1.

BUG=none
TEST=none
Review URL: http://webrtc-codereview.appspot.com/317010

git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |  2 +-
 source/cpu_id.cc           | 10 +++++
 source/planar_functions.cc | 12 ++++++
 source/rotate.cc           | 12 +++++-
 source/row.h               |  4 ++
 source/row_posix.cc        | 83 +++++++++++++++++++++++++++-----------
 source/row_win.cc          | 33 +++++++++++++--
 7 files changed, 126 insertions(+), 30 deletions(-)

diff --git a/README.chromium b/README.chromium
index 2c4937a49..d02f3a0e7 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 106
+Version: 107
 License: BSD
 License File: LICENSE

diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index cae34b0b8..58fd3732c 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/cpu_id.h"
 
+#include <stdlib.h>  // for getenv
 #ifdef _MSC_VER
 #include <intrin.h>
 #endif
@@ -55,6 +56,15 @@ int InitCpuFlags() {
   cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
               (cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
               kCpuInitialized;
+
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSE2")) {
+    cpu_info_ &= ~kCpuHasSSE2;
+  }
+  // environment variable overrides for testing.
+  if (getenv("LIBYUV_DISABLE_SSSE3")) {
+    cpu_info_ &= ~kCpuHasSSSE3;
+  }
 #elif defined(__ANDROID__) && defined(__ARM_NEON__)
   uint64_t features = android_getCpuFeatures();
   cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
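
InitCpuFlags() caches its result in cpu_info_, so these variables must be
set before the first TestCpuFlag() call. A minimal sketch of a test that
forces the scalar paths; the main() harness is illustrative and not part of
the patch, and setenv() is POSIX (a Windows build would use _putenv_s()):

    #include <cstdio>
    #include <cstdlib>
    #include "libyuv/cpu_id.h"

    int main() {
      // Set before any libyuv call so the cached cpu_info_ sees them.
      setenv("LIBYUV_DISABLE_SSE2", "1", 1);
      setenv("LIBYUV_DISABLE_SSSE3", "1", 1);
      // Both flags now report 0, so callers fall back to the *_C row functions.
      printf("SSE2=%d SSSE3=%d\n",
             libyuv::TestCpuFlag(libyuv::kCpuHasSSE2),
             libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3));
      return 0;
    }
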
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 4e6dbb6cc..c03eae1d1 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -340,6 +340,18 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 32) &&
+      IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+      IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+      IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+      IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
+      IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+      IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
diff --git a/source/rotate.cc b/source/rotate.cc
index cd96e410b..5ded3c703 100644
--- a/source/rotate.cc
+++ b/source/rotate.cc
@@ -867,6 +867,14 @@ void RotatePlane180(const uint8* src, int src_stride,
       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
     ReverseRow = ReverseRow_SSSE3;
   } else
+#endif
+#if defined(HAS_REVERSE_ROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ReverseRow = ReverseRow_SSE2;
+  } else
 #endif
   {
     ReverseRow = ReverseRow_C;
@@ -1019,8 +1027,8 @@ __asm {
     lea        eax, [eax - 16]
     pshufb     xmm0, xmm5
     movlpd     qword ptr [edx], xmm0
-    lea        edx, [edx + 8]
     movhpd     qword ptr [edi], xmm0
+    lea        edx, [edx + 8]
     lea        edi, [edi + 8]
     sub        ecx, 8
     ja         convertloop
@@ -1044,8 +1052,8 @@ void ReverseRowUV_SSSE3(const uint8* src,
     "lea        -16(%0),%0                     \n"
     "pshufb     %%xmm5,%%xmm0                  \n"
     "movlpd     %%xmm0,(%1)                    \n"
-    "lea        8(%1),%1                       \n"
     "movhpd     %%xmm0,(%2)                    \n"
+    "lea        8(%1),%1                       \n"
     "lea        8(%2),%2                       \n"
     "sub        $8,%3                          \n"
     "ja         1b                             \n"
diff --git a/source/row.h b/source/row.h
index d39d0b9ba..631b5a9e1 100644
--- a/source/row.h
+++ b/source/row.h
@@ -65,6 +65,7 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
 #define HAS_REVERSE_ROW_SSSE3
+#define HAS_REVERSE_ROW_SSE2
 #endif
 
 // The following are available on Neon platforms
@@ -102,6 +103,9 @@ void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #ifdef HAS_REVERSE_ROW_SSSE3
 void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
 #endif
+#ifdef HAS_REVERSE_ROW_SSE2
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
+#endif
 #ifdef HAS_REVERSE_ROW_NEON
 void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
 #endif
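
The SSSE3 branch stays ahead of the new SSE2 branch at both call sites
because pshufb reverses all 16 bytes in a single instruction, while the SSE2
kernel needs shifts, an or, and three shuffles. A condensed sketch of the
selection pattern both dispatchers follow; PickReverseRow is a hypothetical
helper name, IS_ALIGNED mirrors libyuv's macro, the row functions are the
ones declared in row.h above, and the width requirement varies per call site
(I420Mirror demands a multiple of 32 for the SSE2 path):

    #include "libyuv/cpu_id.h"
    #include "row.h"

    using namespace libyuv;

    #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))

    typedef void (*ReverseRowFn)(const uint8* src, uint8* dst, int width);

    static ReverseRowFn PickReverseRow(const uint8* src, int src_stride,
                                       uint8* dst, int dst_stride, int width) {
      if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
          IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
          IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
        return ReverseRow_SSSE3;  // fastest: one pshufb per 16 bytes
      }
      if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
          IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
          IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
        return ReverseRow_SSE2;   // shift/or plus three shuffles
      }
      return ReverseRow_C;        // scalar fallback, no alignment needed
    }
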
diff --git a/source/row_posix.cc b/source/row_posix.cc
index eadde7818..b6e9bf9e0 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -17,16 +17,22 @@ namespace libyuv {
 extern "C" {
 #endif
 
+#ifdef __APPLE__
+#define CONST
+#else
+#define CONST static const
+#endif
+
 #ifdef HAS_ARGBTOUVROW_SSSE3
-vec8 kARGBToU = {
+CONST vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
 
-uvec8 kARGBToV = {
+CONST uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };
 
-uvec8 kAddUV128 = {
+CONST uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
@@ -35,31 +41,31 @@ uvec8 kAddUV128 = {
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Constant multiplication table for converting ARGB to I400.
-vec8 kARGBToY = {
+CONST vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
-uvec8 kAddY16 = {
+CONST uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
 // Shuffle table for converting BG24 to ARGB.
-uvec8 kShuffleMaskBG24ToARGB = {
+CONST uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };
 
 // Shuffle table for converting RAW to ARGB.
-uvec8 kShuffleMaskRAWToARGB = {
+CONST uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };
 
 // Shuffle table for converting ABGR to ARGB.
-uvec8 kShuffleMaskABGRToARGB = {
+CONST uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };
 
 // Shuffle table for converting BGRA to ARGB.
-uvec8 kShuffleMaskBGRAToARGB = {
+CONST uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };
@@ -352,7 +358,7 @@ struct {
   vec16 kUVBiasR;
   vec16 kYSub16;
   vec16 kYToRgb;
-} SIMD_ALIGNED(kYuvConstants) = {
+} CONST SIMD_ALIGNED(kYuvConstants) = {
   { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
   { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
   { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
@@ -445,8 +451,8 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
     "punpcklbw  %%xmm2,%%xmm5                  \n"
     "movdqa     %%xmm5,%%xmm0                  \n"
     "punpcklwd  %%xmm1,%%xmm5                  \n"
-    "movdqa     %%xmm5,(%3)                    \n"
     "punpckhwd  %%xmm1,%%xmm0                  \n"
+    "movdqa     %%xmm5,(%3)                    \n"
     "movdqa     %%xmm0,0x10(%3)                \n"
     "lea        0x20(%3),%3                    \n"
     "sub        $0x8,%4                        \n"
@@ -480,8 +486,8 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
     "punpcklbw  %%xmm5,%%xmm0                  \n"
     "movdqa     %%xmm2,%%xmm1                  \n"
     "punpcklwd  %%xmm0,%%xmm2                  \n"
-    "movdqa     %%xmm2,(%3)                    \n"
     "punpckhwd  %%xmm0,%%xmm1                  \n"
+    "movdqa     %%xmm2,(%3)                    \n"
    "movdqa      %%xmm1,0x10(%3)                \n"
     "lea        0x20(%3),%3                    \n"
     "sub        $0x8,%4                        \n"
@@ -640,11 +646,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
 
 #ifdef HAS_REVERSE_ROW_SSSE3
 
-// TODO(fbarchard): define CONST macro that is static const for linux, but
-// does nothing for gcc on OSX (which has an internal compiler fault)
-
 // Shuffle table for reversing the bytes.
-uvec8 kShuffleReverse = {
+CONST uvec8 kShuffleReverse = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
@@ -653,14 +656,14 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
   asm volatile (
   "movdqa     %3,%%xmm5                        \n"
   "lea        -0x10(%0,%2,1),%0                \n"
-"1:                                            \n"
-  "movdqa     (%0),%%xmm0                      \n"
-  "lea        -0x10(%0),%0                     \n"
-  "pshufb     %%xmm5,%%xmm0                    \n"
-  "movdqa     %%xmm0,(%1)                      \n"
-  "lea        0x10(%1),%1                      \n"
-  "sub        $0x10,%2                         \n"
-  "ja         1b                               \n"
+  "1:                                          \n"
+  "movdqa     (%0),%%xmm0                      \n"
+  "lea        -0x10(%0),%0                     \n"
+  "pshufb     %%xmm5,%%xmm0                    \n"
+  "movdqa     %%xmm0,(%1)                      \n"
+  "lea        0x10(%1),%1                      \n"
+  "sub        $0x10,%2                         \n"
+  "ja         1b                               \n"
   : "+r"(src),   // %0
     "+r"(dst),   // %1
    "+r"(temp_width)  // %2
@@ -673,6 +676,38 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
 }
 #endif
 
+#ifdef HAS_REVERSE_ROW_SSE2
+
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+  intptr_t temp_width = static_cast<intptr_t>(width);
+  asm volatile (
+  "lea        -0x10(%0,%2,1),%0                \n"
+  "1:                                          \n"
+  "movdqa     (%0),%%xmm0                      \n"
+  "lea        -0x10(%0),%0                     \n"
+  "movdqa     %%xmm0,%%xmm1                    \n"
+  "psllw      $0x8,%%xmm0                      \n"
+  "psrlw      $0x8,%%xmm1                      \n"
+  "por        %%xmm1,%%xmm0                    \n"
+  "pshuflw    $0x1b,%%xmm0,%%xmm0              \n"
+  "pshufhw    $0x1b,%%xmm0,%%xmm0              \n"
+  "pshufd     $0x4e,%%xmm0,%%xmm0              \n"
+  "movdqa     %%xmm0,(%1)                      \n"
+  "lea        0x10(%1),%1                      \n"
+  "sub        $0x10,%2                         \n"
+  "ja         1b                               \n"
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(temp_width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1"
+#endif
+  );
+}
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_win.cc b/source/row_win.cc
index 9acd70764..e744f9149 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -654,8 +654,8 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm5, xmm2           // AR
     movdqa     xmm0, xmm5
     punpcklwd  xmm5, xmm1           // BGRA first 4 pixels
-    movdqa     [edx], xmm5
     punpckhwd  xmm0, xmm1           // BGRA next 4 pixels
+    movdqa     [edx], xmm5
     movdqa     [edx + 16], xmm0
     lea        edx,  [edx + 32]
 
@@ -694,8 +694,8 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     punpcklbw  xmm0, xmm5           // BA
     movdqa     xmm1, xmm2
     punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
-    movdqa     [edx], xmm2
     punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
+    movdqa     [edx], xmm2
     movdqa     [edx + 16], xmm1
     lea        edx,  [edx + 32]
 
@@ -794,7 +794,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
 
  convertloop:
     // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
-    movq       xmm0, [eax]
+    movq       xmm0, qword ptr [eax]
     lea        eax, [eax + 8]
     punpcklbw  xmm0, xmm0           // Y.Y
     psubusw    xmm0, xmm3
@@ -849,6 +849,33 @@ __asm {
 }
 #endif
 
+#ifdef HAS_REVERSE_ROW_SSE2
+
+__declspec(naked)
+void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
+__asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    lea        eax, [eax + ecx - 16]
+ convertloop:
+    movdqa     xmm0, [eax]
+    lea        eax, [eax - 16]
+    movdqa     xmm1, xmm0       // swap bytes
+    psllw      xmm0, 8
+    psrlw      xmm1, 8
+    por        xmm0, xmm1
+    pshuflw    xmm0, xmm0, 0x1b // swap words
+    pshufhw    xmm0, xmm0, 0x1b
+    pshufd     xmm0, xmm0, 0x4e
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    sub        ecx, 16
+    ja         convertloop
+    ret
+  }
+}
+#endif
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
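
For reference, the new ReverseRow_SSE2 kernel reverses each 16-byte block in
three steps: swap the two bytes inside every 16-bit word (psllw/psrlw/por),
reverse the four words inside each 64-bit half (pshuflw/pshufhw with shuffle
code 0x1b), then swap the two halves (pshufd 0x4e). The same sequence written
with SSE2 intrinsics, as an illustrative equivalent of the asm above rather
than code from the patch:

    #include <emmintrin.h>  // SSE2 intrinsics

    static inline __m128i ReverseBytes16(__m128i v) {
      // Swap the bytes within each 16-bit lane: (lo << 8) | (hi >> 8).
      __m128i r = _mm_or_si128(_mm_slli_epi16(v, 8), _mm_srli_epi16(v, 8));
      r = _mm_shufflelo_epi16(r, 0x1b);   // reverse the 4 words of the low qword
      r = _mm_shufflehi_epi16(r, 0x1b);   // reverse the 4 words of the high qword
      return _mm_shuffle_epi32(r, 0x4e);  // swap the two qwords
    }

Unlike the pshufb path, none of these instructions needs a shuffle-control
constant loaded from memory, which is why the SSE2 kernel has no counterpart
to kShuffleReverse.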