From 67a0987dd9f4100d7dd2233f392f8b91c276380c Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 7 Nov 2013 20:35:17 +0000 Subject: [PATCH] Scale Up2 ported to NaCL. BUG=none TEST=none R=nfullagar@chromium.org, nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/3589004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@846 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/planar_functions.h | 1 + include/libyuv/version.h | 2 +- source/scale_argb.cc | 36 ++++++++++++++++++++++++++++--- 4 files changed, 36 insertions(+), 5 deletions(-) diff --git a/README.chromium b/README.chromium index 06c1c70d1..6a7da8db4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 845 +Version: 846 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 6b0dae855..22650f2ad 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -355,6 +355,7 @@ int ARGBComputeCumulativeSum(const uint8* src_argb, int src_stride_argb, // dst_cumsum table of width * height * 16 bytes aligned to 16 byte boundary. // dst_stride32_cumsum is number of ints in a row (width * 4). // radius is number of pixels around the center. e.g. 1 = 3x3. 2=5x5. +// Blur is optimized for radius of 5 (11x11) or less. LIBYUV_API int ARGBBlur(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 436574664..ad6b5253d 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 845 +#define LIBYUV_VERSION 846 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/scale_argb.cc b/source/scale_argb.cc index f224761fb..21ed8bcb9 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -371,7 +371,6 @@ static void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, // Reads 4 pixels, duplicates them and writes 8 pixels. // Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. -#define HAS_SCALEARGBCOLSUP2_SSE2 __declspec(naked) __declspec(align(16)) void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, int dst_width, int /* x */, int /* dx */) { @@ -675,6 +674,39 @@ void ScaleARGBCols_SSE2(uint8* dst_argb, const uint8* src_argb, ); } +// Reads 4 pixels, duplicates them and writes 8 pixels. +// Alignment requirement: src_argb 16 byte aligned, dst_argb 16 byte aligned. +void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, + int dst_width, int /* x */, int /* dx */) { + asm volatile ( + ".p2align 4 \n" + BUNDLEALIGN + "1: \n" + "movdqa " MEMACCESS(1) ",%%xmm0 \n" + "lea " MEMLEA(0x10,1) ",%1 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpckldq %%xmm0,%%xmm0 \n" + "punpckhdq %%xmm1,%%xmm1 \n" + "sub $0x8,%2 \n" + "movdqa %%xmm0," MEMACCESS(0) " \n" + "movdqa %%xmm1," MEMACCESS2(0x10,0) " \n" + "lea " MEMLEA(0x20,0) ",%0 \n" + "jg 1b \n" + + : "+r"(dst_argb), // %0 + "+r"(src_argb), // %1 + "+r"(dst_width) // %2 + : + : "memory", "cc" +#if defined(__native_client__) && defined(__x86_64__) + , "r14" +#endif +#if defined(__SSE2__) + , "xmm0", "xmm1" +#endif + ); +} + // Shuffle table for arranging 2 pixels into pairs for pmaddubsw static uvec8 kShuffleColARGB = { 0u, 4u, 1u, 5u, 2u, 6u, 3u, 7u, // bbggrraa 1st pixel @@ -1363,14 +1395,12 @@ static void ScaleARGBSimple(int src_width, int src_height, #if defined(HAS_SCALEARGBCOLS_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { ScaleARGBCols = ScaleARGBCols_SSE2; -#if defined(HAS_SCALEARGBCOLS_SSE2) if (src_width * 2 == dst_width && IS_ALIGNED(dst_width, 8) && (x >> 16) == 0 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) { ScaleARGBCols = ScaleARGBColsUp2_SSE2; } -#endif } #endif