From 14f657b44a6fa2e19573684bea7203cd18975a6e Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Wed, 27 Feb 2013 06:38:41 +0000 Subject: [PATCH] SSE2 scale for ARGB and posix with specializations BUG=177 TESTED=try bots Review URL: https://webrtc-codereview.appspot.com/1115008 git-svn-id: http://libyuv.googlecode.com/svn/trunk@585 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 3 +- include/libyuv/version.h | 2 +- source/scale.cc | 85 +++++++++++++----- source/scale_argb.cc | 190 ++++++++++++++++++++++++++------------- 5 files changed, 193 insertions(+), 89 deletions(-) diff --git a/README.chromium b/README.chromium index 60b8b2da4..d57660be9 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 584 +Version: 585 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5bb958698..f051b3def 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -169,7 +169,8 @@ extern "C" { #endif // The following are available on Neon platforms -#if !defined(LIBYUV_DISABLE_NEON) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) +#if !defined(LIBYUV_DISABLE_NEON) && \ + (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_ABGRTOARGBROW_NEON #define HAS_ABGRTOUVROW_NEON #define HAS_ABGRTOYROW_NEON diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 01960a700..65ec74f9d 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 584 +#define LIBYUV_VERSION 585 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 50425ccd5..a146580a9 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -23,9 +23,6 @@ namespace libyuv { extern "C" { #endif -// Bilinear SSE2 is disabled. -#define SSE2_DISABLED 1 - // Note: Some SSE2 reference manuals // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf @@ -1954,20 +1951,29 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { + asm volatile ( asm volatile ( "sub %1,%0 \n" + "shr %3 \n" "cmp $0x0,%3 \n" - "je 2f \n" - "cmp $0x80,%3 \n" - "je 3f \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" "movd %3,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm5 \n" - "psrlw $0x1,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n" - "punpckldq %%xmm5,%%xmm5 \n" - "punpcklqdq %%xmm5,%%xmm5 \n" + "pshufd $0x0,%%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. ".p2align 4 \n" "1: \n" "movdqa (%1),%%xmm0 \n" @@ -1991,25 +1997,56 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" - "jmp 4f \n" + "jmp 99f \n" + + // Blend 25 / 75. ".p2align 4 \n" - "2: \n" + "25: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + ".p2align 4 \n" + "50: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + ".p2align 4 \n" + "75: \n" + "movdqa (%1),%%xmm1 \n" + "movdqa (%1,%4,1),%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x10,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + ".p2align 4 \n" + "100: \n" "movdqa (%1),%%xmm0 \n" "sub $0x10,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" - "jg 2b \n" - "jmp 4f \n" - ".p2align 4 \n" - "3: \n" - "movdqa (%1),%%xmm0 \n" - "pavgb (%1,%4,1),%%xmm0 \n" - "sub $0x10,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "jg 3b \n" - ".p2align 4 \n" - "4: \n" + "jg 100b \n" + + "99: \n" "punpckhbw %%xmm0,%%xmm0 \n" "pshufhw $0xff,%%xmm0,%%xmm0 \n" "punpckhqdq %%xmm0,%%xmm0 \n" diff --git a/source/scale_argb.cc b/source/scale_argb.cc index d593a54b8..12ae70f03 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -200,8 +200,7 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb, } // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version. -#ifndef SSE2_DISABLED -#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED +#define HAS_SCALEARGBFILTERROWS_SSE2 __declspec(naked) __declspec(align(16)) void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride, int dst_width, @@ -215,19 +214,24 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi + // Dispatch to specialized filters if applicable. cmp eax, 0 - je xloop1 + je xloop100 // 0 / 256. Blend 100 / 0. + cmp eax, 64 + je xloop75 // 64 / 256 is 0.25. Blend 75 / 25. cmp eax, 128 - je xloop2 + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + cmp eax, 192 + je xloop25 // 192 / 256 is 0.75. Blend 25 / 75. movd xmm5, eax // xmm5 = y fraction punpcklbw xmm5, xmm5 + psrlw xmm5, 1 punpcklwd xmm5, xmm5 - pshufd xmm5, xmm5, 0 + punpckldq xmm5, xmm5 + punpcklqdq xmm5, xmm5 pxor xmm4, xmm4 - // f * row1 + (1 - frac) row0 - // frac * (row1 - row0) + row0 align 16 xloop: movdqa xmm0, [esi] // row0 @@ -240,6 +244,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, punpckhbw xmm1, xmm4 psubw xmm2, xmm0 // row1 - row0 psubw xmm3, xmm1 + paddw xmm2, xmm2 // 9 bits * 15 bits = 8.16 + paddw xmm3, xmm3 pmulhw xmm2, xmm5 // scale diff pmulhw xmm3, xmm5 paddw xmm0, xmm2 // sum rows @@ -249,44 +255,63 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, movdqa [esi + edi], xmm0 lea esi, [esi + 16] jg xloop + jmp xloop99 + // Blend 25 / 75. + align 16 + xloop25: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop25 + jmp xloop99 + + // Blend 50 / 50. + align 16 + xloop50: + movdqa xmm0, [esi] + movdqa xmm1, [esi + edx] + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop50 + jmp xloop99 + + // Blend 75 / 25. + align 16 + xloop75: + movdqa xmm1, [esi] + movdqa xmm0, [esi + edx] + pavgb xmm0, xmm1 + pavgb xmm0, xmm1 + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop75 + jmp xloop99 + + // Blend 100 / 0 - Copy row unchanged. + align 16 + xloop100: + movdqa xmm0, [esi] + sub ecx, 4 + movdqa [esi + edi], xmm0 + lea esi, [esi + 16] + jg xloop100 + + xloop99: shufps xmm0, xmm0, 0xff movdqa [esi + edi], xmm0 // duplicate last pixel for filtering pop edi pop esi ret - - align 16 - xloop1: - movdqa xmm0, [esi] - sub ecx, 4 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop1 - - shufps xmm0, xmm0, 0xff - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret - - align 16 - xloop2: - movdqa xmm0, [esi] - pavgb xmm0, [esi + edx] - sub ecx, 4 - movdqa [esi + edi], xmm0 - lea esi, [esi + 16] - jg xloop2 - - shufps xmm0, xmm0, 0xff - movdqa [esi + edi], xmm0 - pop edi - pop esi - ret } } -#endif // SSE2_DISABLED // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. #define HAS_SCALEARGBFILTERROWS_SSSE3 @@ -544,23 +569,33 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb, ); } -#ifndef SSE2_DISABLED // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version -#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED +#define HAS_SCALEARGBFILTERROWS_SSE2 void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { asm volatile ( "sub %1,%0 \n" + "shr %3 \n" "cmp $0x0,%3 \n" - "je 2f \n" - "cmp $0x80,%3 \n" - "je 3f \n" + "je 100f \n" + "cmp $0x20,%3 \n" + "je 75f \n" + "cmp $0x40,%3 \n" + "je 50f \n" + "cmp $0x60,%3 \n" + "je 25f \n" + + "movd %3,%%xmm0 \n" + "neg %3 \n" + "add $0x80,%3 \n" "movd %3,%%xmm5 \n" - "punpcklbw %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm5 \n" "punpcklwd %%xmm5,%%xmm5 \n" "pshufd $0x0,%%xmm5,%%xmm5 \n" "pxor %%xmm4,%%xmm4 \n" + + // General purpose row blend. ".p2align 4 \n" "1: \n" "movdqa (%1),%%xmm0 \n" @@ -573,6 +608,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, "punpckhbw %%xmm4,%%xmm1 \n" "psubw %%xmm0,%%xmm2 \n" "psubw %%xmm1,%%xmm3 \n" + "paddw %%xmm2,%%xmm2 \n" + "paddw %%xmm3,%%xmm3 \n" "pmulhw %%xmm5,%%xmm2 \n" "pmulhw %%xmm5,%%xmm3 \n" "paddw %%xmm2,%%xmm0 \n" @@ -582,31 +619,61 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" "jg 1b \n" - "jmp 4f \n" + "jmp 99f \n" + + // Blend 25 / 75. ".p2align 4 \n" - "2: \n" + "25: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 25b \n" + "jmp 99f \n" + + // Blend 50 / 50. + ".p2align 4 \n" + "50: \n" + "movdqa (%1),%%xmm0 \n" + "movdqa (%1,%4,1),%%xmm1 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 50b \n" + "jmp 99f \n" + + // Blend 75 / 25. + ".p2align 4 \n" + "75: \n" + "movdqa (%1),%%xmm1 \n" + "movdqa (%1,%4,1),%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "pavgb %%xmm1,%%xmm0 \n" + "sub $0x4,%2 \n" + "movdqa %%xmm0,(%1,%0,1) \n" + "lea 0x10(%1),%1 \n" + "jg 75b \n" + "jmp 99f \n" + + // Blend 100 / 0 - Copy row unchanged. + ".p2align 4 \n" + "100: \n" "movdqa (%1),%%xmm0 \n" "sub $0x4,%2 \n" "movdqa %%xmm0,(%1,%0,1) \n" "lea 0x10(%1),%1 \n" - "jg 2b \n" - "jmp 4f \n" - ".p2align 4 \n" - "3: \n" - "movdqa (%1),%%xmm0 \n" - "pavgb (%1,%4,1),%%xmm0 \n" - "sub $0x4,%2 \n" - "movdqa %%xmm0,(%1,%0,1) \n" - "lea 0x10(%1),%1 \n" - "lea 0x10(%1),%1 \n" - "jg 3b \n" - ".p2align 4 \n" - "4: \n" + "jg 100b \n" + + "99: \n" "shufps $0xff,%%xmm0,%%xmm0 \n" "movdqa %%xmm0,(%1,%0,1) \n" - : "+r"(dst_argb), // %0 - "+r"(src_argb), // %1 - "+r"(dst_width), // %2 + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width), // %2 "+r"(source_y_fraction) // %3 : "r"(static_cast(src_stride)) // %4 : "memory", "cc" @@ -615,7 +682,6 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb, #endif ); } -#endif // SSE2_DISABLED // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version #define HAS_SCALEARGBFILTERROWS_SSSE3