From b5ea79d84566ec7ac6651e54c9a93254ae549cbd Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 13 Apr 2015 18:56:08 +0000 Subject: [PATCH] add rows handle height of 1 using a more general while-style loop. BUG=none TESTED=try bots Review URL: https://webrtc-codereview.appspot.com/45999004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1366 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/scale.cc | 4 ++-- source/scale_posix.cc | 35 ++++++++++++++--------------------- source/scale_win.cc | 30 ++++++++++++------------------ 5 files changed, 30 insertions(+), 43 deletions(-) diff --git a/README.chromium b/README.chromium index b2f917e5f..a34b15f3b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1365 +Version: 1366 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 30b658db4..1156f3891 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1365 +#define LIBYUV_VERSION 1366 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale.cc b/source/scale.cc index 761c79a4d..08f981504 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -745,8 +745,8 @@ static void ScalePlaneBox(int src_width, int src_height, ScaleSlope(src_width, src_height, dst_width, dst_height, kFilterBox, &x, &y, &dx, &dy); src_width = Abs(src_width); - // TODO(fbarchard): Remove this and make AddRows handle boxheight 1. - if (!IS_ALIGNED(src_width, 16) || dst_height * 2 > src_height) { + // TODO(fbarchard): Remove this and make AddRows handle odd width. + if (!IS_ALIGNED(src_width, 16)) { uint8* dst = dst_ptr; int j; for (j = 0; j < dst_height; ++j) { diff --git a/source/scale_posix.cc b/source/scale_posix.cc index bb6e57efe..c373e4122 100644 --- a/source/scale_posix.cc +++ b/source/scale_posix.cc @@ -573,44 +573,38 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8* src_ptr, ); } +// Reads 16xN bytes and produces 16 shorts at a time. void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, uint16* dst_ptr, int src_width, int src_height) { int tmp_height = 0; intptr_t tmp_src = 0; asm volatile ( "pxor %%xmm4,%%xmm4 \n" - "sub $0x1,%5 \n" + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" LABELALIGN "1: \n" - "movdqu " MEMACCESS(0) ",%%xmm0 \n" - "mov %0,%3 \n" - "add %6,%0 \n" - "movdqa %%xmm0,%%xmm1 \n" - "punpcklbw %%xmm4,%%xmm0 \n" - "punpckhbw %%xmm4,%%xmm1 \n" - "mov %5,%2 \n" - "test %2,%2 \n" - "je 3f \n" - - LABELALIGN - "2: \n" - "movdqu " MEMACCESS(0) ",%%xmm2 \n" - "add %6,%0 \n" + "movdqu " MEMACCESS(3) ",%%xmm2 \n" + "add %6,%3 \n" "movdqa %%xmm2,%%xmm3 \n" "punpcklbw %%xmm4,%%xmm2 \n" "punpckhbw %%xmm4,%%xmm3 \n" "paddusw %%xmm2,%%xmm0 \n" "paddusw %%xmm3,%%xmm1 \n" "sub $0x1,%2 \n" - "jg 2b \n" + "jg 1b \n" - LABELALIGN - "3: \n" "movdqu %%xmm0," MEMACCESS(1) " \n" "movdqu %%xmm1," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x10,3) ",%0 \n" "lea " MEMLEA(0x20,1) ",%1 \n" + "lea " MEMLEA(0x10,0) ",%0 \n" // src_ptr += 16 + "mov %0,%3 \n" // row pointer + "mov %5,%2 \n" // height + "pxor %%xmm0,%%xmm0 \n" // clear accumulators + "pxor %%xmm1,%%xmm1 \n" "sub $0x10,%4 \n" "jg 1b \n" : "+r"(src_ptr), // %0 @@ -799,8 +793,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8* src_argb, // Reads 4 pixels at a time. // Alignment requirement: dst_argb 16 byte aligned. void ScaleARGBRowDownEven_SSE2(const uint8* src_argb, ptrdiff_t src_stride, - int src_stepx, - uint8* dst_argb, int dst_width) { + int src_stepx, uint8* dst_argb, int dst_width) { intptr_t src_stepx_x4 = (intptr_t)(src_stepx); intptr_t src_stepx_x12 = 0; asm volatile ( diff --git a/source/scale_win.cc b/source/scale_win.cc index 63c66d69f..9067d673e 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -708,11 +708,9 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8* src_ptr, } // Reads 16xN bytes and produces 16 shorts at a time. -// TODO(fbarchard): Make this handle 4xN bytes for any width ARGB. __declspec(naked) void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, - uint16* dst_ptr, int src_width, - int src_height) { + uint16* dst_ptr, int src_width, int src_height) { __asm { push esi push edi @@ -724,21 +722,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, mov ecx, [esp + 16 + 16] // dst_width mov ebx, [esp + 16 + 20] // height pxor xmm4, xmm4 - dec ebx + mov eax, esi // row pointer + mov ebp, ebx // height + pxor xmm0, xmm0 // clear accumulators + pxor xmm1, xmm1 xloop: - // first row - movdqu xmm0, [esi] - lea eax, [esi + edx] - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm4 - punpckhbw xmm1, xmm4 - lea esi, [esi + 16] - mov ebp, ebx - test ebp, ebp - je ydone - // sum remaining rows + // sum rows yloop: movdqu xmm2, [eax] // read 16 pixels lea eax, [eax + edx] // advance to next row @@ -750,11 +741,14 @@ void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride, sub ebp, 1 jg yloop - ydone: movdqu [edi], xmm0 movdqu [edi + 16], xmm1 - lea edi, [edi + 32] - + lea edi, [edi + 32] // dst_ptr += 16 + lea esi, [esi + 16] // src_ptr += 16 + mov eax, esi // row pointer + mov ebp, ebx // height + pxor xmm0, xmm0 // clear accumulators + pxor xmm1, xmm1 sub ecx, 16 jg xloop