From 8ffe78abd29e43ca344734af9fb0fc91067e7d1c Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 30 May 2013 07:14:14 +0000 Subject: [PATCH] Scale down by 4 used 3rd pixel BUG=232 TEST=convert.exe -f 0 faces_640x480_P420.yuv face2_160x120_P420.yuv R=changjun.yang@intel.com Review URL: https://webrtc-codereview.appspot.com/1579005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@709 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/convert.cc | 12 +- source/scale.cc | 430 ++++++++------------------------------ source/scale_argb.cc | 28 +-- source/scale_argb_neon.cc | 16 +- source/scale_mips.cc | 12 +- source/scale_neon.cc | 52 +++-- unit_test/compare_test.cc | 2 +- 9 files changed, 151 insertions(+), 405 deletions(-) diff --git a/README.chromium b/README.chromium index 0a637246f..4f9044897 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 708 +Version: 709 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index eb2624953..779b8f00d 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 708 +#define LIBYUV_VERSION 709 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index 1e066d5ec..980df7edd 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -137,15 +137,15 @@ int I422ToI420(const uint8* src_y, int src_stride_y, #if !defined(LIBYUV_DISABLE_NEON) && \ (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SCALEROWDOWN2_NEON -void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); #elif !defined(LIBYUV_DISABLE_X86) && \ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) -void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); #endif -void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); LIBYUV_API @@ -173,11 +173,11 @@ int I444ToI420(const uint8* src_y, int src_stride_y, } int halfwidth = (width + 1) >> 1; void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = ScaleRowDown2Int_C; + uint8* dst_ptr, int dst_width) = ScaleRowDown2Box_C; #if defined(HAS_SCALEROWDOWN2_NEON) if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(halfwidth, 16)) { - ScaleRowDown2 = ScaleRowDown2Int_NEON; + ScaleRowDown2 = ScaleRowDown2Box_NEON; } #elif defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && @@ -186,7 +186,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y, IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) && IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) && IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) { - ScaleRowDown2 = ScaleRowDown2Int_SSE2; + ScaleRowDown2 = ScaleRowDown2Box_SSE2; } #endif diff --git a/source/scale.cc b/source/scale.cc index 721beee08..4189d3dcd 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -55,13 +55,13 @@ void SetUseReferenceImpl(bool use) { void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void 
ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); #define HAS_SCALEROWDOWN4_NEON void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width); -void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); #define HAS_SCALEROWDOWN34_NEON @@ -71,10 +71,10 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, void ScaleRowDown34_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width); -void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); @@ -84,11 +84,11 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width); // 32x3 -> 12x1 -void ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, +void ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); // 32x2 -> 12x1 -void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); // 16x2 -> 16x1 @@ -217,7 +217,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi @@ -290,7 +290,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, // Blends 32x2 rectangle to 16x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned. __declspec(naked) __declspec(align(16)) -static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, +static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -343,8 +343,9 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // src_stride ignored mov edx, [esp + 12] // dst_ptr mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask 0x000000ff + pcmpeqb xmm5, xmm5 // generate mask 0x00ff0000 psrld xmm5, 24 + pslld xmm5, 16 align 16 wloop: @@ -354,6 +355,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, pand xmm0, xmm5 pand xmm1, xmm5 packuswb xmm0, xmm1 + psrlw xmm0, 8 packuswb xmm0, xmm0 sub ecx, 8 movq qword ptr [edx], xmm0 @@ -367,7 +369,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, // Blends 32x4 rectangle to 8x1. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
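The ScaleRowDown4_SSE2 hunks above carry the actual fix for BUG=232: the mask is now built as 0x00ff0000 (psrld 24, then the new pslld 16) so pand keeps byte 2 of every 4-byte group, and the added psrlw 8 drops that byte into the low position before the final pack. In scalar terms, the unfiltered 1/4 path now samples the 3rd pixel of each group of 4 instead of the 1st, which sits nearer the center of the block being reduced. A minimal C++ sketch of the corrected sampling (the function name and standalone uint8_t type are illustrative, not libyuv's):

    #include <stdint.h>

    // Point-sample a row to 1/4 width: keep pixel index 2 of every 4,
    // the sample nearest the block center (the old code kept index 0).
    void ScaleRowDown4Point(const uint8_t* src_ptr, uint8_t* dst,
                            int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src_ptr[x * 4 + 2];
      }
    }

This matches the C reference change later in the patch, where ScaleRowDown4_C switches from src_ptr[0]/src_ptr[4] to src_ptr[2]/src_ptr[6].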
__declspec(naked) __declspec(align(16)) -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { push esi @@ -425,112 +427,6 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, } } -#define HAS_SCALEROWDOWN8_SSE2 -// Point samples 32 pixels to 4 pixels. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - mov eax, [esp + 4] // src_ptr - // src_stride ignored - mov edx, [esp + 12] // dst_ptr - mov ecx, [esp + 16] // dst_width - pcmpeqb xmm5, xmm5 // generate mask isolating 1 src 8 bytes - psrlq xmm5, 56 - - align 16 - wloop: - movdqa xmm0, [eax] - movdqa xmm1, [eax + 16] - lea eax, [eax + 32] - pand xmm0, xmm5 - pand xmm1, xmm5 - packuswb xmm0, xmm1 // 32->16 - packuswb xmm0, xmm0 // 16->8 - packuswb xmm0, xmm0 // 8->4 - sub ecx, 4 - movd dword ptr [edx], xmm0 - lea edx, [edx + 4] - jg wloop - - ret - } -} - -// Blends 32x8 rectangle to 4x1. -// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned. -__declspec(naked) __declspec(align(16)) -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - __asm { - push esi - push edi - push ebp - mov eax, [esp + 12 + 4] // src_ptr - mov esi, [esp + 12 + 8] // src_stride - mov edx, [esp + 12 + 12] // dst_ptr - mov ecx, [esp + 12 + 16] // dst_width - lea edi, [esi + esi * 2] // src_stride * 3 - pxor xmm7, xmm7 - - align 16 - wloop: - movdqa xmm0, [eax] // average 8 rows to 1 - movdqa xmm1, [eax + 16] - movdqa xmm2, [eax + esi] - movdqa xmm3, [eax + esi + 16] - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - movdqa xmm2, [eax + esi * 2] - movdqa xmm3, [eax + esi * 2 + 16] - movdqa xmm4, [eax + edi] - movdqa xmm5, [eax + edi + 16] - lea ebp, [eax + esi * 4] - lea eax, [eax + 32] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - movdqa xmm2, [ebp] - movdqa xmm3, [ebp + 16] - movdqa xmm4, [ebp + esi] - movdqa xmm5, [ebp + esi + 16] - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - movdqa xmm4, [ebp + esi * 2] - movdqa xmm5, [ebp + esi * 2 + 16] - movdqa xmm6, [ebp + edi] - pavgb xmm4, xmm6 - movdqa xmm6, [ebp + edi + 16] - pavgb xmm5, xmm6 - pavgb xmm2, xmm4 - pavgb xmm3, xmm5 - pavgb xmm0, xmm2 - pavgb xmm1, xmm3 - - psadbw xmm0, xmm7 // average 32 pixels to 4 - psadbw xmm1, xmm7 - pshufd xmm0, xmm0, 0xd8 // x1x0 -> xx01 - pshufd xmm1, xmm1, 0x8d // x3x2 -> 32xx - por xmm0, xmm1 // -> 3201 - psrlw xmm0, 3 - packuswb xmm0, xmm0 - packuswb xmm0, xmm0 - - sub ecx, 4 - movd dword ptr [edx], xmm0 - lea edx, [edx + 4] - jg wloop - - pop ebp - pop edi - pop esi - ret - } -} - #define HAS_SCALEROWDOWN34_SSSE3 // Point samples 32 pixels to 24 pixels. // Produces three 8 byte values. For each 8 bytes, 16 bytes are read. @@ -588,7 +484,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. 
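The renames in this region (ScaleRowDown4Int_SSE2 to ScaleRowDown4Box_SSE2, and likewise throughout the patch) replace the vague Int suffix with Box, naming the filter these kernels actually implement: a rounded box average over the source block. For the filtered 1/4 path each output pixel averages a 4x4 block. A scalar sketch of that computation, assuming 8-bit samples (names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Rounded 4x4 box average for 1/4-size scaling: sum 16 source
    // samples; adding 8 (half of 16) before the shift rounds to nearest.
    void ScaleRowDown4BoxSketch(const uint8_t* src_ptr, ptrdiff_t stride,
                                uint8_t* dst, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        unsigned sum = 0;
        for (int row = 0; row < 4; ++row) {
          for (int col = 0; col < 4; ++col) {
            sum += src_ptr[row * stride + x * 4 + col];
          }
        }
        dst[x] = static_cast<uint8_t>((sum + 8) >> 4);
      }
    }

The SSE2 kernel reaches essentially the same result with a cascade of pavgb pairwise rounding averages rather than an exact 16-term sum.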
__declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -647,7 +543,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, // Note that movdqa+palign may be better than movdqu. // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned. __declspec(naked) __declspec(align(16)) -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -743,7 +639,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, // Scale 16x3 pixels to 6x1 with interpolation __declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -809,7 +705,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, // Scale 16x2 pixels to 6x1 with interpolation __declspec(naked) __declspec(align(16)) -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { __asm { @@ -1288,7 +1184,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown2Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" @@ -1353,7 +1249,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, ); } -static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr, +static void ScaleRowDown2Box_Unaligned_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -1398,6 +1294,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrld $0x18,%%xmm5 \n" + "pslld $0x10,%%xmm5 \n" ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" @@ -1406,6 +1303,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, "pand %%xmm5,%%xmm0 \n" "pand %%xmm5,%%xmm1 \n" "packuswb %%xmm1,%%xmm0 \n" + "psrlw $0x8,%%xmm0 \n" "packuswb %%xmm0,%%xmm0 \n" "movq %%xmm0,(%1) \n" "lea 0x8(%1),%1 \n" @@ -1422,7 +1320,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown4Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { intptr_t stridex3 = 0; asm volatile ( @@ -1476,103 +1374,6 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -#define HAS_SCALEROWDOWN8_SSE2 -static void ScaleRowDown8_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - asm volatile ( - "pcmpeqb %%xmm5,%%xmm5 \n" - "psrlq $0x38,%%xmm5 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "lea 0x20(%0),%0 \n" - "pand %%xmm5,%%xmm0 \n" - "pand %%xmm5,%%xmm1 \n" - "packuswb %%xmm1,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - 
"+r"(dst_width) // %2 - : - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm5" -#endif - ); -} - -static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) { - intptr_t stridex3 = 0; - intptr_t row4 = 0; - asm volatile ( - "lea (%5,%5,2),%3 \n" - "pxor %%xmm7,%%xmm7 \n" - ".p2align 4 \n" - "1: \n" - "movdqa (%0),%%xmm0 \n" - "movdqa 0x10(%0),%%xmm1 \n" - "movdqa (%0,%5,1),%%xmm2 \n" - "movdqa 0x10(%0,%5,1),%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa (%0,%5,2),%%xmm2 \n" - "movdqa 0x10(%0,%5,2),%%xmm3 \n" - "movdqa (%0,%3,1),%%xmm4 \n" - "movdqa 0x10(%0,%3,1),%%xmm5 \n" - "lea (%0,%5,4),%4 \n" - "lea 0x20(%0),%0 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "movdqa 0x0(%4),%%xmm2 \n" - "movdqa 0x10(%4),%%xmm3 \n" - "movdqa 0x0(%4,%5,1),%%xmm4 \n" - "movdqa 0x10(%4,%5,1),%%xmm5 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "movdqa 0x0(%4,%5,2),%%xmm4 \n" - "movdqa 0x10(%4,%5,2),%%xmm5 \n" - "movdqa 0x0(%4,%3,1),%%xmm6 \n" - "pavgb %%xmm6,%%xmm4 \n" - "movdqa 0x10(%4,%3,1),%%xmm6 \n" - "pavgb %%xmm6,%%xmm5 \n" - "pavgb %%xmm4,%%xmm2 \n" - "pavgb %%xmm5,%%xmm3 \n" - "pavgb %%xmm2,%%xmm0 \n" - "pavgb %%xmm3,%%xmm1 \n" - "psadbw %%xmm7,%%xmm0 \n" - "psadbw %%xmm7,%%xmm1 \n" - "pshufd $0xd8,%%xmm0,%%xmm0 \n" - "pshufd $0x8d,%%xmm1,%%xmm1 \n" - "por %%xmm1,%%xmm0 \n" - "psrlw $0x3,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "packuswb %%xmm0,%%xmm0 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" - "sub $0x4,%2 \n" - "jg 1b \n" - : "+r"(src_ptr), // %0 - "+r"(dst_ptr), // %1 - "+rm"(dst_width), // %2 - "+r"(stridex3), // %3 - "+r"(row4) // %4 - : "r"(static_cast(src_stride)) // %5 - : "memory", "cc" -#if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" -#endif - ); -} - #define HAS_SCALEROWDOWN34_SSSE3 static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { @@ -1613,7 +1414,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } -static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown34_1_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -1680,7 +1481,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, ); } -static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown34_0_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -1783,7 +1584,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, ); } -static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -1829,7 +1630,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, ); } -static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, +static void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -2282,7 +2083,7 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr, #define HAS_SCALEROWDOWN2_MIPS_DSPR2 void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void 
ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); #define HAS_SCALEFILTERROWS_MIPS_DSPR2 void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, @@ -2292,21 +2093,21 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, #define HAS_SCALEROWDOWN4_MIPS_DSPR2 void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); #define HAS_SCALEROWDOWN34_MIPS_DSPR2 void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width); -void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width); #define HAS_SCALEROWDOWN38_MIPS_DSPR2 void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); -void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width); #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) @@ -2326,7 +2127,7 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, } } -void ScaleRowDown2Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { const uint8* s = src_ptr; const uint8* t = src_ptr + src_stride; @@ -2347,17 +2148,17 @@ static void ScaleRowDown4_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { uint8* dend = dst + dst_width - 1; do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[4]; + dst[0] = src_ptr[2]; + dst[1] = src_ptr[6]; dst += 2; src_ptr += 8; } while (dst < dend); if (dst_width & 1) { - dst[0] = src_ptr[0]; + dst[0] = src_ptr[2]; } } -static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown4Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { intptr_t stride = src_stride; uint8* dend = dst + dst_width - 1; @@ -2393,33 +2194,6 @@ static void ScaleRowDown4Int_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -static void ScaleRowDown8_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, - uint8* dst, int dst_width) { - uint8* dend = dst + dst_width - 1; - do { - dst[0] = src_ptr[0]; - dst[1] = src_ptr[8]; - dst += 2; - src_ptr += 16; - } while (dst < dend); - if (dst_width & 1) { - dst[0] = src_ptr[0]; - } -} - -// Note calling code checks width is less than max and if not -// uses ScaleRowDown8_C instead. 
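The C reference rows above all share one rounding idiom: add half the divisor before shifting, as in (s[0] + s[1] + t[0] + t[1] + 2) >> 2 for a 2x2 box. Without the bias the shift truncates, darkening results by up to one level. A self-contained check (the helper name is mine, not libyuv's):

    #include <stdint.h>
    #include <stdio.h>

    // Rounded 2x2 box average as in ScaleRowDown2Box_C: +2 is half of
    // the divisor 4, turning truncation into round-to-nearest.
    uint8_t BoxAvg2x2(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
      return static_cast<uint8_t>((a + b + c + d + 2) >> 2);
    }

    int main() {
      // 255,255,255,254 averages to 254.75: rounding yields 255,
      // while a plain (sum >> 2) would yield 254.
      printf("%d\n", BoxAvg2x2(255, 255, 255, 254));
      return 0;
    }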
-static void ScaleRowDown8Int_C(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst, int dst_width) { - SIMD_ALIGNED(uint8 src_row[kMaxStride * 2]); - assert(dst_width <= kMaxStride); - ScaleRowDown4Int_C(src_ptr, src_stride, src_row, dst_width * 2); - ScaleRowDown4Int_C(src_ptr + src_stride * 4, src_stride, - src_row + kMaxStride, - dst_width * 2); - ScaleRowDown2Int_C(src_row, kMaxStride, dst, dst_width); -} - static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); @@ -2434,7 +2208,7 @@ static void ScaleRowDown34_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, } // Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown34_0_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); const uint8* s = src_ptr; @@ -2457,7 +2231,7 @@ static void ScaleRowDown34_0_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, } // Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown34_1_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); const uint8* s = src_ptr; @@ -2524,7 +2298,7 @@ static void ScaleFilterCols34_C(uint8* dst_ptr, const uint8* src_ptr, #define HAS_SCALEROWDOWN34_SSE2 // Filter rows 0 and 1 together, 3 : 1 -static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, +static void ScaleRowDown34_0_Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); @@ -2534,7 +2308,7 @@ static void ScaleRowDown34_0_Int_SSE2(const uint8* src_ptr, } // Filter rows 1 and 2 together, 1 : 1 -static void ScaleRowDown34_1_Int_SSE2(const uint8* src_ptr, +static void ScaleRowDown34_1_Box_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); @@ -2557,7 +2331,7 @@ static void ScaleRowDown38_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, } // 8x3 -> 3x1 -static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, +static void ScaleRowDown38_3_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); @@ -2583,7 +2357,7 @@ static void ScaleRowDown38_3_Int_C(const uint8* src_ptr, } // 8x2 -> 3x1 -static void ScaleRowDown38_2_Int_C(const uint8* src_ptr, ptrdiff_t src_stride, +static void ScaleRowDown38_2_Box_C(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { assert((dst_width % 3 == 0) && (dst_width > 0)); intptr_t stride = src_stride; @@ -2657,35 +2431,40 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, FilterMode filtering) { void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering ? ScaleRowDown2Int_C : ScaleRowDown2_C; + filtering ? ScaleRowDown2Box_C : ScaleRowDown2_C; + int row_stride = src_stride << 1; + if (!filtering) { + src_ptr += src_stride; // Point to odd rows. + src_stride = 0; + } + #if defined(HAS_SCALEROWDOWN2_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON; + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 16)) { + ScaleRowDown2 = filtering ? 
ScaleRowDown2Box_NEON : ScaleRowDown2_NEON; } #elif defined(HAS_SCALEROWDOWN2_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 : + ScaleRowDown2 = filtering ? ScaleRowDown2Box_Unaligned_SSE2 : ScaleRowDown2_Unaligned_SSE2; - if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && + if (IS_ALIGNED(src_ptr, 16) && + IS_ALIGNED(src_stride, 16) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) { - ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2; + ScaleRowDown2 = filtering ? ScaleRowDown2Box_SSE2 : ScaleRowDown2_SSE2; } } #elif defined(HAS_SCALEROWDOWN2_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && - IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_ptr, 4) && + IS_ALIGNED(src_stride, 4) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown2 = filtering ? - ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + ScaleRowDown2Box_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; } #endif - src_ptr += src_stride; // Point to odd rows. // TODO(fbarchard): Loop through source height to allow odd height. for (int y = 0; y < dst_height; ++y) { ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 1); + src_ptr += row_stride; dst_ptr += dst_stride; } } @@ -2701,58 +2480,34 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, FilterMode filtering) { void (*ScaleRowDown4)(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) = - filtering ? ScaleRowDown4Int_C : ScaleRowDown4_C; + filtering ? ScaleRowDown4Box_C : ScaleRowDown4_C; + int row_stride = src_stride << 2; + if (!filtering) { + src_ptr += src_stride * 2; // Point to row 2. + src_stride = 0; + } #if defined(HAS_SCALEROWDOWN4_NEON) - if (TestCpuFlag(kCpuHasNEON) && - IS_ALIGNED(dst_width, 4)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_NEON : ScaleRowDown4_NEON; + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8)) { + ScaleRowDown4 = filtering ? ScaleRowDown4Box_NEON : ScaleRowDown4_NEON; } #elif defined(HAS_SCALEROWDOWN4_SSE2) if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 8) && + IS_ALIGNED(dst_width, 8) && IS_ALIGNED(row_stride, 16) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; + ScaleRowDown4 = filtering ? ScaleRowDown4Box_SSE2 : ScaleRowDown4_SSE2; } #elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(row_stride, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown4 = filtering ? - ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + ScaleRowDown4Box_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; } #endif for (int y = 0; y < dst_height; ++y) { ScaleRowDown4(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 2); - dst_ptr += dst_stride; - } -} - -// Scale plane, 1/8 -// This is an optimized version for scaling down a plane to 1/8 -// of its original size. 
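The restructured ScalePlaneDown2 and ScalePlaneDown4 above introduce a pattern shared by both plane scalers: row_stride records how far the source advances per output row, and on the unfiltered path the code pre-advances src_ptr to the centered row and then passes src_stride = 0, so point-sampling row functions keep the same signature as the Box versions while never reading a second row. A condensed C++ sketch of that skeleton with a trivial stand-in row function (all names here are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Stand-in for the selected row function (C or SIMD in libyuv).
    void RowDown2Point(const uint8_t* src, ptrdiff_t /* src_stride */,
                       uint8_t* dst, int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[x * 2 + 1];  // odd pixels, as in the point-sample path
      }
    }

    void PlaneDown2Sketch(int dst_width, int dst_height,
                          int src_stride, int dst_stride,
                          const uint8_t* src_ptr, uint8_t* dst_ptr,
                          bool filtering) {
      int row_stride = src_stride << 1;  // two source rows per output row
      if (!filtering) {
        src_ptr += src_stride;  // point to odd rows
        src_stride = 0;         // row function must not touch a second row
      }
      for (int y = 0; y < dst_height; ++y) {
        RowDown2Point(src_ptr, src_stride, dst_ptr, dst_width);
        src_ptr += row_stride;
        dst_ptr += dst_stride;
      }
    }

Folding the row advance into row_stride is also what lets the dispatch code above check alignment of row_stride rather than the raw src_stride.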
- -static void ScalePlaneDown8(int /* src_width */, int /* src_height */, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr, - FilterMode filtering) { - void (*ScaleRowDown8)(const uint8* src_ptr, ptrdiff_t src_stride, - uint8* dst_ptr, int dst_width) = - filtering && (dst_width <= kMaxStride) ? - ScaleRowDown8Int_C : ScaleRowDown8_C; -#if defined(HAS_SCALEROWDOWN8_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(dst_width, 4) && - IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { - ScaleRowDown8 = filtering ? ScaleRowDown8Int_SSE2 : ScaleRowDown8_SSE2; - } -#endif - - for (int y = 0; y < dst_height; ++y) { - ScaleRowDown8(src_ptr, src_stride, dst_ptr, dst_width); - src_ptr += (src_stride << 3); + src_ptr += row_stride; dst_ptr += dst_stride; } } @@ -2773,8 +2528,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, ScaleRowDown34_0 = ScaleRowDown34_C; ScaleRowDown34_1 = ScaleRowDown34_C; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_C; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_C; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_C; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_C; } #if defined(HAS_SCALEROWDOWN34_NEON) if (TestCpuFlag(kCpuHasNEON) && (dst_width % 24 == 0)) { @@ -2782,16 +2537,16 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, ScaleRowDown34_0 = ScaleRowDown34_NEON; ScaleRowDown34_1 = ScaleRowDown34_NEON; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_NEON; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_NEON; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_NEON; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_NEON; } } #endif #if defined(HAS_SCALEROWDOWN34_SSE2) if (TestCpuFlag(kCpuHasSSE2) && (dst_width % 24 == 0) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) && filtering) { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSE2; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSE2; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSE2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSE2; } #endif #if defined(HAS_SCALEROWDOWN34_SSSE3) @@ -2801,8 +2556,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, ScaleRowDown34_0 = ScaleRowDown34_SSSE3; ScaleRowDown34_1 = ScaleRowDown34_SSSE3; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_SSSE3; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_SSSE3; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_SSSE3; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_SSSE3; } } #endif @@ -2814,8 +2569,8 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; } else { - ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2; - ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2; + ScaleRowDown34_0 = ScaleRowDown34_0_Box_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Box_MIPS_DSPR2; } } #endif @@ -2874,8 +2629,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_3 = ScaleRowDown38_C; ScaleRowDown38_2 = ScaleRowDown38_C; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_C; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_C; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_C; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_C; } #if defined(HAS_SCALEROWDOWN38_NEON) if (TestCpuFlag(kCpuHasNEON) && (dst_width % 12 == 0)) { @@ -2883,8 +2638,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_3 = ScaleRowDown38_NEON; ScaleRowDown38_2 = ScaleRowDown38_NEON; } else { - ScaleRowDown38_3 = 
ScaleRowDown38_3_Int_NEON; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_NEON; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_NEON; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_NEON; } } #elif defined(HAS_SCALEROWDOWN38_SSSE3) @@ -2894,8 +2649,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_3 = ScaleRowDown38_SSSE3; ScaleRowDown38_2 = ScaleRowDown38_SSSE3; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_SSSE3; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_SSSE3; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_SSSE3; } } #elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) @@ -2906,8 +2661,8 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; } else { - ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2; - ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2; + ScaleRowDown38_3 = ScaleRowDown38_3_Box_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Box_MIPS_DSPR2; } } #endif @@ -3330,11 +3085,6 @@ void ScalePlane(const uint8* src, int src_stride, // optimized, 1/4 ScalePlaneDown4(src_width, src_height, dst_width, dst_height, src_stride, dst_stride, src, dst, filtering); - } else if (8 * dst_width == src_width && 8 * dst_height == src_height && - filtering != kFilterBilinear) { - // optimized, 1/8 - ScalePlaneDown8(src_width, src_height, dst_width, dst_height, - src_stride, dst_stride, src, dst, filtering); } else { // Arbitrary downsample ScalePlaneDown(src_width, src_height, dst_width, dst_height, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index fa271556a..989df55a6 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -34,12 +34,12 @@ static __inline int Abs(int v) { void ScaleARGBRowDownEven_NEON(const uint8* src_argb, int src_stride, int src_stepx, uint8* dst_argb, int dst_width); -void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, int src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, int src_stride, int src_stepx, uint8* dst_argb, int dst_width); void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst, int dst_width); -void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width); #endif @@ -75,7 +75,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb, // Blends 8x2 rectangle to 4x1. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. __declspec(naked) __declspec(align(16)) -static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb, +static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, ptrdiff_t src_stride, uint8* dst_argb, int dst_width) { __asm { @@ -150,7 +150,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, // Blends four 2x2 to 4x1. // Alignment requirement: dst_argb 16 byte aligned. 
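"Blends four 2x2 to 4x1" means each output ARGB pixel is the rounded per-channel average of a 2x2 block of source pixels. In scalar form, assuming byte-interleaved 4-byte pixels (the helper mirrors the shape of the C rows below but is illustrative):

    #include <stddef.h>
    #include <stdint.h>

    // Average one 2x2 block of interleaved 4-byte ARGB pixels,
    // channel by channel, rounding to nearest (+2 before >> 2).
    void ARGBBoxAvg2x2(const uint8_t* src, ptrdiff_t src_stride,
                       uint8_t* dst) {
      for (int c = 0; c < 4; ++c) {
        dst[c] = static_cast<uint8_t>(
            (src[c] + src[c + 4] +
             src[src_stride + c] + src[src_stride + c + 4] + 2) >> 2);
      }
    }

The SSE2 and NEON versions below get the same effect with packed rounding averages (pavgb / vrshrn) instead of per-channel scalar sums.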
__declspec(naked) __declspec(align(16))
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
+static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                          ptrdiff_t src_stride,
                                          int src_stepx,
                                          uint8* dst_argb, int dst_width) {
@@ -366,7 +366,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
   );
 }

-static void ScaleARGBRowDown2Int_SSE2(const uint8* src_argb,
+static void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       uint8* dst_argb, int dst_width) {
   asm volatile (
@@ -438,7 +438,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride,

 // Blends four 2x2 to 4x1.
 // Alignment requirement: dst_argb 16 byte aligned.
-static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
+static void ScaleARGBRowDownEvenBox_SSE2(const uint8* src_argb,
                                          ptrdiff_t src_stride, int src_stepx,
                                          uint8* dst_argb, int dst_width) {
   intptr_t src_stepx_x4 = static_cast<intptr_t>(src_stepx);
@@ -644,7 +644,7 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
   }
 }

-static void ScaleARGBRowDown2Int_C(const uint8* src_argb, ptrdiff_t src_stride,
+static void ScaleARGBRowDown2Box_C(const uint8* src_argb, ptrdiff_t src_stride,
                                    uint8* dst_argb, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
     dst_argb[0] = (src_argb[0] + src_argb[4] +
@@ -677,7 +677,7 @@ void ScaleARGBRowDownEven_C(const uint8* src_argb, ptrdiff_t /* src_stride */,
   }
 }

-static void ScaleARGBRowDownEvenInt_C(const uint8* src_argb,
+static void ScaleARGBRowDownEvenBox_C(const uint8* src_argb,
                                       ptrdiff_t src_stride,
                                       int src_stepx,
                                       uint8* dst_argb, int dst_width) {
@@ -748,18 +748,18 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
   int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) =
-      filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
+      filtering ? ScaleARGBRowDown2Box_C : ScaleARGBRowDown2_C;
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
+    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_SSE2 :
         ScaleARGBRowDown2_SSE2;
   }
 #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
       IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
-    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
+    ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
         ScaleARGBRowDown2_NEON;
   }
 #endif
@@ -788,17 +788,17 @@ static void ScaleARGBDownEven(int src_width, int src_height,
   src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
   void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_step, uint8* dst_argb, int dst_width) =
-      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+      filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
 #if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
         ScaleARGBRowDownEven_SSE2;
   }
 #elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_argb, 4)) {
-    ScaleARGBRowDownEven = filtering ?
ScaleARGBRowDownEvenInt_NEON : + ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON : ScaleARGBRowDownEven_NEON; } #endif diff --git a/source/scale_argb_neon.cc b/source/scale_argb_neon.cc index 819186bc7..1b297b53d 100644 --- a/source/scale_argb_neon.cc +++ b/source/scale_argb_neon.cc @@ -38,11 +38,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( // change the stride to row 2 pointer - "add %1, %0 \n" + "add %1, %1, %0 \n" "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. @@ -74,11 +74,9 @@ void ScaleARGBRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, - int src_stepx, +void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( - "add %0, #4 \n" // point to odd pixels. "mov r12, %3, lsl #2 \n" ".p2align 2 \n" "1: \n" @@ -86,7 +84,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, "vld1.32 {d0[1]}, [%0], r12 \n" "vld1.32 {d1[0]}, [%0], r12 \n" "vld1.32 {d1[1]}, [%0], r12 \n" - "subs %2, #4 \n" // 4 pixels per loop. + "subs %2, %2, #4 \n" // 4 pixels per loop. "vst1.8 {q0}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 @@ -99,12 +97,12 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t, // Reads 4 pixels at a time. // Alignment requirement: src_argb 4 byte aligned. -void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride, +void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride, int src_stepx, uint8* dst_argb, int dst_width) { asm volatile ( "mov r12, %4, lsl #2 \n" - "add %1, %0 \n" + "add %1, %1, %0 \n" ".p2align 2 \n" "1: \n" "vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1 @@ -125,7 +123,7 @@ void ScaleARGBRowDownEvenInt_NEON(const uint8* src_argb, ptrdiff_t src_stride, "vadd.u16 q2, q2, q3 \n" // (e+f)_(g+h) "vrshrn.u16 d0, q0, #2 \n" // first 2 pixels. "vrshrn.u16 d1, q2, #2 \n" // next 2 pixels. - "subs %3, #4 \n" // 4 pixels per loop. + "subs %3, %3, #4 \n" // 4 pixels per loop. "vst1.8 {q0}, [%2]! 
\n" "bgt 1b \n" : "+r"(src_argb), // %0 diff --git a/source/scale_mips.cc b/source/scale_mips.cc index b30eaba0c..66f2571a1 100644 --- a/source/scale_mips.cc +++ b/source/scale_mips.cc @@ -76,7 +76,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { const uint8* t = src_ptr + src_stride; @@ -230,7 +230,7 @@ void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { intptr_t stride = src_stride; const uint8* s1 = src_ptr + stride; @@ -355,7 +355,7 @@ void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown34_0_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { __asm__ __volatile__ ( ".set push \n" @@ -410,7 +410,7 @@ void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown34_1_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* d, int dst_width) { __asm__ __volatile__ ( ".set push \n" @@ -506,7 +506,7 @@ void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown38_2_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { intptr_t stride = src_stride; const uint8* t = src_ptr + stride; @@ -558,7 +558,7 @@ void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } -void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, +void ScaleRowDown38_3_Box_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { intptr_t stride = src_stride; diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 2449ec80e..33240396b 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -27,7 +27,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, asm volatile ( "1: \n" // load even pixels into q0, odd into q1 - "vld2.u8 {q0,q1}, [%0]! \n" + "vld2.u8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop "vst1.u8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" @@ -39,14 +39,14 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown2Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst, int dst_width) { asm volatile ( // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" - "vld1.u8 {q0,q1}, [%0]! \n" // load row 1 and post inc - "vld1.u8 {q2,q3}, [%1]! \n" // load row 2 and post inc + "vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q1, q1 \n" @@ -69,12 +69,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( "1: \n" - "vld2.u8 {d0, d1}, [%0]! 
\n" - "subs %2, #4 \n" - "vtrn.u8 d1, d0 \n" - "vshrn.u16 d0, q0, #8 \n" - "vst1.u32 {d0[1]}, [%1]! \n" - "bgt 1b \n" + "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "subs %2, %2, #8 \n" // 8 processed per loop + "vst1.u8 {d2}, [%1]! \n" + "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 "+r"(dst_width) // %2 @@ -83,7 +81,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, ); } -void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, +void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( "add r4, %0, %3 \n" @@ -94,7 +92,7 @@ void ScaleRowDown4Int_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vld1.u8 {q1}, [r4]! \n" "vld1.u8 {q2}, [r5]! \n" "vld1.u8 {q3}, [%3]! \n" - "subs %2, #4 \n" + "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" "vpadal.u8 q0, q2 \n" @@ -121,7 +119,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, asm volatile ( "1: \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "subs %2, #24 \n" + "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 "vst3.u8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" @@ -133,7 +131,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, ); } -void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, +void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -142,7 +140,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, "1: \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, #24 \n" + "subs %2, %2, #24 \n" // filter src line 0 with src line 1 // expand chars to shorts to allow for room @@ -189,7 +187,7 @@ void ScaleRowDown34_0_Int_NEON(const uint8* src_ptr, ); } -void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, +void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -198,7 +196,7 @@ void ScaleRowDown34_1_Int_NEON(const uint8* src_ptr, "1: \n" "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 - "subs %2, #24 \n" + "subs %2, %2, #24 \n" // average src line 0 with src line 1 "vrhadd.u8 q0, q0, q2 \n" "vrhadd.u8 q1, q1, q3 \n" @@ -247,7 +245,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, "vld1.u8 {q3}, [%3] \n" "1: \n" "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" - "subs %2, #12 \n" + "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" "vst1.u8 {d4}, [%1]! \n" @@ -262,7 +260,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, } // 32x3 -> 12x1 -void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, +void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -280,7 +278,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" - "subs %2, #12 \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 
0,1 - 2,3 - 4,5 - 6,7 @@ -372,7 +370,7 @@ void OMITFP ScaleRowDown38_3_Int_NEON(const uint8* src_ptr, } // 32x2 -> 12x1 -void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, +void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( @@ -387,7 +385,7 @@ void ScaleRowDown38_2_Int_NEON(const uint8* src_ptr, // d3 = 30 70 31 71 32 72 33 73 "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - "subs %2, #12 \n" + "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data // so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7 @@ -487,7 +485,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "1: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #16 \n" + "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" "vmlal.u8 q13, d2, d5 \n" @@ -502,7 +500,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "25: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #16 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -513,7 +511,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "50: \n" "vld1.u8 {q0}, [%1]! \n" "vld1.u8 {q1}, [%2]! \n" - "subs %3, #16 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 50b \n" @@ -523,7 +521,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "75: \n" "vld1.u8 {q1}, [%1]! \n" "vld1.u8 {q0}, [%2]! \n" - "subs %3, #16 \n" + "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" "vst1.u8 {q0}, [%0]! \n" @@ -533,7 +531,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, // Blend 100 / 0 - Copy row unchanged. "100: \n" "vld1.u8 {q0}, [%1]! \n" - "subs %3, #16 \n" + "subs %3, %3, #16 \n" "vst1.u8 {q0}, [%0]! \n" "bgt 100b \n" diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index 742d044f0..7fe6c3b0b 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -165,7 +165,7 @@ TEST_F(libyuvTest, BenchmarkSumSquareError_Opt) { memset(src_b, 0, kMaxWidth); int count = benchmark_iterations_ * - (benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth; + ((benchmark_width_ * benchmark_height_ + kMaxWidth - 1) / kMaxWidth); for (int i = 0; i < count; ++i) { h1 = ComputeSumSquareError(src_a, src_b, kMaxWidth); }
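The scale_neon.cc hunks above also normalize subs to the three-operand form (subs %3, %3, #16) that stricter assemblers expect; behavior is unchanged. One last note on the compare_test.cc hunk that closes the patch: the added parentheses make the per-chunk ceiling division happen before the multiply. Integer multiplication and division do not commute, so the old ordering inflated the iteration count (and, with 32-bit int, risked overflow for large frames). A tiny demonstration with made-up sizes (the constants are illustrative, not the test's actual configuration):

    #include <stdio.h>

    int main() {
      long long iterations = 1000;
      long long pixels = 1280LL * 720;  // benchmark_width_ * benchmark_height_
      long long kMaxWidth = 4096;
      // New form: iterations * ceil(pixels / kMaxWidth).
      long long fixed = iterations * ((pixels + kMaxWidth - 1) / kMaxWidth);
      // Old form: the multiply happens first, so rounding is applied once
      // overall rather than once per chunk.
      long long old_form = iterations * (pixels + kMaxWidth - 1) / kMaxWidth;
      printf("fixed=%lld old=%lld\n", fixed, old_form);  // 225000 vs 225999
      return 0;
    }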