From f906ae136098fab859149ab44a972c1dbc6c6689 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 1 May 2012 00:30:51 +0000
Subject: [PATCH] scale to half size handle unaligned memory

BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/543006

git-svn-id: http://libyuv.googlecode.com/svn/trunk@254 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |   2 +-
 include/libyuv/version.h |   2 +-
 source/scale.cc          | 150 +++++++++++++++++++++++++++++++++++++--
 3 files changed, 145 insertions(+), 9 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3b06206af..52f29a6c3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 253
+Version: 254
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index c15f5b904..b6797c893 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,7 +11,7 @@
 
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 253
+#define LIBYUV_VERSION 254
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/scale.cc b/source/scale.cc
index b67098aab..13ca2288f 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -57,6 +57,7 @@ void SetUseReferenceImpl(bool use) {
 
 #if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SCALEROWDOWN2_NEON
+// Note - not static due to reuse in convert for 444 to 420.
 void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                         uint8* dst, int dst_width) {
   asm volatile (
@@ -747,6 +748,80 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   }
 }
 
+// Reads 32 pixels, throws half away and writes 16 pixels.
+// Alignment requirement: none.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
+                                         uint8* dst_ptr, int dst_width) {
+  __asm {
+    mov        eax, [esp + 4]        // src_ptr
+                                     // src_stride ignored
+    mov        edx, [esp + 12]       // dst_ptr
+    mov        ecx, [esp + 16]       // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      16
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    lea        eax,  [eax + 32]
+    pand       xmm0, xmm5
+    pand       xmm1, xmm5
+    packuswb   xmm0, xmm1
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    ret
+  }
+}
+
+// Blends 32x2 rectangle to 16x1.
+// Alignment requirement: none.
+__declspec(naked) __declspec(align(16))
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+                                            int src_stride,
+                                            uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       esi
+    mov        eax, [esp + 4 + 4]    // src_ptr
+    mov        esi, [esp + 4 + 8]    // src_stride
+    mov        edx, [esp + 4 + 12]   // dst_ptr
+    mov        ecx, [esp + 4 + 16]   // dst_width
+    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
+    psrlw      xmm5, 8
+
+    align      16
+  wloop:
+    movdqu     xmm0, [eax]
+    movdqu     xmm1, [eax + 16]
+    movdqu     xmm2, [eax + esi]
+    movdqu     xmm3, [eax + esi + 16]
+    lea        eax,  [eax + 32]
+    pavgb      xmm0, xmm2            // average rows
+    pavgb      xmm1, xmm3
+
+    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    psrlw      xmm0, 8
+    movdqa     xmm3, xmm1
+    psrlw      xmm1, 8
+    pand       xmm2, xmm5
+    pand       xmm3, xmm5
+    pavgw      xmm0, xmm2
+    pavgw      xmm1, xmm3
+    packuswb   xmm0, xmm1
+
+    sub        ecx, 16
+    movdqu     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        esi
+    ret
+  }
+}
+
 #define HAS_SCALEROWDOWN4_SSE2
 // Point samples 32 pixels to 8 pixels.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
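
For reference, a minimal scalar sketch of what the two unaligned kernels above
compute (not part of the patch; "uint8" is assumed to be libyuv's typedef for
unsigned char, and the function names here are illustrative only). The point
sampler keeps the even pixel of each pair, matching the 0x00ff00ff mask plus
packuswb sequence; the box filter averages each 2x2 block with rounding. The
SIMD version rounds twice (pavgb on rows, then pavgw on columns), so its
output may differ from this single-rounding form by at most 1.

  typedef unsigned char uint8;

  // Point sample: keep one of every two source pixels.
  static void ScaleRowDown2_Sketch(const uint8* src_ptr, int /* src_stride */,
                                   uint8* dst_ptr, int dst_width) {
    for (int x = 0; x < dst_width; ++x) {
      dst_ptr[x] = src_ptr[x * 2];
    }
  }

  // 2x2 box filter: average four source pixels per output pixel, rounding up.
  static void ScaleRowDown2Int_Sketch(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width) {
    const uint8* next = src_ptr + src_stride;  // second source row
    for (int x = 0; x < dst_width; ++x) {
      dst_ptr[x] = static_cast<uint8>(
          (src_ptr[x * 2] + src_ptr[x * 2 + 1] +
           next[x * 2] + next[x * 2 + 1] + 2) >> 2);
    }
  }
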
@@ -1578,8 +1653,8 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
   );
 }
 
-static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
-                                  uint8* dst_ptr, int dst_width) {
+void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                           uint8* dst_ptr, int dst_width) {
   asm volatile (
     "pcmpeqb   %%xmm5,%%xmm5                   \n"
     "psrlw     $0x8,%%xmm5                     \n"
@@ -1612,6 +1687,65 @@ static void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   );
 }
 
+static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr, int src_stride,
+                                         uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm0                     \n"
+    "movdqu    0x10(%0),%%xmm1                 \n"
+    "lea       0x20(%0),%0                     \n"
+    "pand      %%xmm5,%%xmm0                   \n"
+    "pand      %%xmm5,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0,(%1)                     \n"
+    "lea       0x10(%1),%1                     \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  :
+  : "memory", "cc"
+);
+}
+
+static void ScaleRowDown2Int_Unaligned_SSE2(const uint8* src_ptr,
+                                            int src_stride,
+                                            uint8* dst_ptr, int dst_width) {
+  asm volatile (
+    "pcmpeqb   %%xmm5,%%xmm5                   \n"
+    "psrlw     $0x8,%%xmm5                     \n"
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm0                     \n"
+    "movdqu    0x10(%0),%%xmm1                 \n"
+    "movdqu    (%0,%3,1),%%xmm2                \n"
+    "movdqu    0x10(%0,%3,1),%%xmm3            \n"
+    "lea       0x20(%0),%0                     \n"
+    "pavgb     %%xmm2,%%xmm0                   \n"
+    "pavgb     %%xmm3,%%xmm1                   \n"
+    "movdqa    %%xmm0,%%xmm2                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "movdqa    %%xmm1,%%xmm3                   \n"
+    "psrlw     $0x8,%%xmm1                     \n"
+    "pand      %%xmm5,%%xmm2                   \n"
+    "pand      %%xmm5,%%xmm3                   \n"
+    "pavgw     %%xmm2,%%xmm0                   \n"
+    "pavgw     %%xmm3,%%xmm1                   \n"
+    "packuswb  %%xmm1,%%xmm0                   \n"
+    "movdqu    %%xmm0,(%1)                     \n"
+    "lea       0x10(%1),%1                     \n"
+    "sub       $0x10,%2                        \n"
+    "jg        1b                              \n"
+  : "+r"(src_ptr),    // %0
+    "+r"(dst_ptr),    // %1
+    "+r"(dst_width)   // %2
+  : "r"(static_cast<intptr_t>(src_stride))  // %3
+  : "memory", "cc"
+);
+}
+
 #define HAS_SCALEROWDOWN4_SSE2
 static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                                uint8* dst_ptr, int dst_width) {
@@ -3118,11 +3252,13 @@ static void ScalePlaneDown2(int src_width, int src_height,
     ScaleRowDown2 = filtering ? ScaleRowDown2Int_NEON : ScaleRowDown2_NEON;
   }
 #elif defined(HAS_SCALEROWDOWN2_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 16) &&
-      IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
-      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
-    ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 16)) {
+    ScaleRowDown2 = filtering ? ScaleRowDown2Int_Unaligned_SSE2 :
+        ScaleRowDown2_Unaligned_SSE2;
+    if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
+        IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+      ScaleRowDown2 = filtering ? ScaleRowDown2Int_SSE2 : ScaleRowDown2_SSE2;
+    }
   }
 #endif
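
The dispatch change in ScalePlaneDown2 above makes the movdqu kernels the
default whenever SSE2 is available and dst_width is a multiple of 16, and
reserves the movdqa kernels for the case where both pointers and both strides
are 16-byte aligned. A condensed, self-contained sketch of that selection
logic follows; the enum, the PickRowDown2 name, and the IS_ALIGNED definition
shown here are illustrative approximations, not part of the library:

  #include <stdint.h>

  // Roughly what libyuv's IS_ALIGNED macro does: true when the value or
  // pointer is a multiple of the power-of-two alignment 'a'.
  #define IS_ALIGNED(p, a) (!((intptr_t)(p) & ((a) - 1)))

  enum RowDown2Kernel { kRowDown2_C, kRowDown2_UnalignedSSE2, kRowDown2_SSE2 };

  // Mirrors the selection now done inside ScalePlaneDown2: unaligned (movdqu)
  // SSE2 kernels by default, aligned (movdqa) kernels only when every pointer
  // and stride is 16-byte aligned, C fallback otherwise.
  RowDown2Kernel PickRowDown2(bool has_sse2, int dst_width,
                              const void* src_ptr, int src_stride,
                              const void* dst_ptr, int dst_stride) {
    RowDown2Kernel kernel = kRowDown2_C;
    if (has_sse2 && IS_ALIGNED(dst_width, 16)) {
      kernel = kRowDown2_UnalignedSSE2;
      if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
          IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
        kernel = kRowDown2_SSE2;
      }
    }
    return kernel;
  }

The practical effect is that half-size scaling of planes with arbitrary
pointer or stride alignment no longer falls back to the C row functions, while
fully aligned buffers keep the existing, slightly faster movdqa path.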