From 714050a29dea9ab6aebb54acb8f79edf9b9f337d Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 17 Feb 2012 22:59:56 +0000 Subject: [PATCH] sse version of BGRA and ABGR To I420 BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/400004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@178 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- DEPS | 5 +- README.chromium | 2 +- include/libyuv/convert.h | 11 + include/libyuv/version.h | 2 +- source/row_posix.cc | 538 +++++++++++++++++++++++++++++++++------ source/row_win.cc | 2 +- 6 files changed, 475 insertions(+), 85 deletions(-) diff --git a/DEPS b/DEPS index 2c2fa5ed3..1b79d1f61 100644 --- a/DEPS +++ b/DEPS @@ -1,7 +1,7 @@ vars = { "libyuv_trunk" : "https://libyuv.googlecode.com/svn/trunk", "chromium_trunk" : "http://src.chromium.org/svn/trunk", - "chromium_revision": "119959", + "chromium_revision": "95033", # Use this googlecode_url variable only if there is an internal mirror for it. # If you do not know, use the full path while defining your new deps entry. "googlecode_url": "http://%s.googlecode.com/svn", @@ -22,13 +22,12 @@ deps = { # Dependencies used by libjpeg-turbo "trunk/third_party/libjpeg_turbo/": - Var("chromium_trunk") + "/src/third_party/libjpeg_turbo@" + Var("chromium_revision"), + Var("chromium_trunk") + "/deps/third_party/libjpeg_turbo@119959", "trunk/third_party/yasm/": Var("chromium_trunk") + "/src/third_party/yasm@" + Var("chromium_revision"), } - hooks = [ # A change to a .gyp, .gypi, or to GYP itself should run the generator. { diff --git a/README.chromium b/README.chromium index 84282c05c..e4d057ad6 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 177 +Version: 178 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 665c5a723..1912cdd77 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -156,6 +156,17 @@ int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame, uint8* dst_v, int dst_stride_v, int width, int height); +#ifdef HAVE_JPEG +// src_width/height provided by capture +// dst_width/height for clipping determine final size. +int MJPGToI420(const uint8* sample, size_t sample_size, + uint8* dst_y, int dst_stride_y, + uint8* dst_u, int dst_stride_u, + uint8* dst_v, int dst_stride_v, + int src_width, int src_height, + int dst_width, int dst_height); +#endif + // Note Bayer formats (BGGR) To I420 are in format_conversion.h // Convert camera sample to I420 with cropping, rotation and vertical flip. diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 30558d0cc..832224dd1 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 177 +#define LIBYUV_VERSION 178 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_posix.cc b/source/row_posix.cc index 7ea47fbbe..249ef8e29 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -20,37 +20,63 @@ extern "C" { // This module is for GCC x86 and x64 #if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM) +// GCC 4.2 on OSX has link error when passing static or const to inline. +// TODO(fbarchard): Use static const when gcc 4.2 support is dropped. 
#ifdef __APPLE__ #define CONST #else #define CONST static const #endif -#ifdef HAS_ARGBTOUVROW_SSSE3 +#ifdef HAS_ARGBTOYROW_SSSE3 + +// Constants for ARGB +CONST vec8 kARGBToY = { + 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 +}; + CONST vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; -CONST uvec8 kARGBToV = { - -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0 +CONST vec8 kARGBToV = { + -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, +}; + +// Constants for BGRA +CONST vec8 kBGRAToY = { + 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 +}; + +CONST vec8 kBGRAToU = { + 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 +}; + +CONST vec8 kBGRAToV = { + 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 +}; + +// Constants for ABGR +CONST vec8 kABGRToY = { + 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 +}; + +CONST vec8 kABGRToU = { + -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 +}; + +CONST vec8 kABGRToV = { + 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 +}; + +CONST uvec8 kAddY16 = { + 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; CONST uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u }; -#endif - -#ifdef HAS_ARGBTOYROW_SSSE3 - -// Constant multiplication table for converting ARGB to I400. -CONST vec8 kARGBToY = { - 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 -}; - -CONST uvec8 kAddY16 = { - 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u -}; // Shuffle table for converting RGB24 to ARGB. CONST uvec8 kShuffleMaskRGB24ToARGB = { @@ -77,7 +103,6 @@ CONST uvec8 kShuffleMaskARGBToRGB24 = { 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u }; - // Shuffle table for converting ARGB to RAW. CONST uvec8 kShuffleMaskARGBToRAW = { 2u, 1u,0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u @@ -569,7 +594,6 @@ void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) { ); } - void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %4,%%xmm5 \n" @@ -641,9 +665,12 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { #endif ); } -#endif -#ifdef HAS_ARGBTOUVROW_SSSE3 +// TODO(fbarchard): pass xmm constants to single block of assembly. +// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes +// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, +// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around +// and considered unsafe. 
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -775,7 +802,420 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, #endif ); } + + +void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif + ); +} + +void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_bgra), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kBGRAToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + : +#if defined(__SSE2__) + "xmm3", "xmm4", "xmm5" +#endif + ); + asm volatile ( + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast(src_stride_bgra)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", 
"xmm2", "xmm6", "xmm7" +#endif + ); +} + +void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kBGRAToU), // %0 + "m"(kBGRAToV), // %1 + "m"(kAddUV128) // %2 + : +#if defined(__SSE2__) + "xmm3", "xmm4", "xmm5" +#endif + ); + asm volatile ( + "sub %1,%2 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_bgra0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast(src_stride_bgra)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} +#endif + + +void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqa %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) { + asm volatile ( + "movdqa %4,%%xmm5 \n" + "movdqa %3,%%xmm4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm3 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm1 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm4,%%xmm3 \n" + "lea 0x40(%0),%0 \n" + "phaddw %%xmm1,%%xmm0 \n" + "phaddw %%xmm3,%%xmm2 \n" + "psrlw $0x7,%%xmm0 \n" + "psrlw $0x7,%%xmm2 \n" + "packuswb %%xmm2,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movdqu %%xmm0,(%1) \n" + "lea 0x10(%1),%1 \n" + "sub $0x10,%2 \n" + "ja 1b \n" + : "+r"(src_abgr), // %0 + "+r"(dst_y), // %1 + "+r"(pix) // %2 + : "m"(kABGRToY), // %3 + "m"(kAddY16) // %4 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void ABGRToUVRow_SSSE3(const 
uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + : +#if defined(__SSE2__) + "xmm3", "xmm4", "xmm5" +#endif + ); + asm volatile ( + "sub %1,%2 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "movdqa 0x20(%0),%%xmm2 \n" + "movdqa 0x30(%0),%%xmm6 \n" + "pavgb (%0,%4,1),%%xmm0 \n" + "pavgb 0x10(%0,%4,1),%%xmm1 \n" + "pavgb 0x20(%0,%4,1),%%xmm2 \n" + "pavgb 0x30(%0,%4,1),%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast(src_stride_abgr)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} + +void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, + uint8* dst_u, uint8* dst_v, int width) { + asm volatile ( + "movdqa %0,%%xmm4 \n" + "movdqa %1,%%xmm3 \n" + "movdqa %2,%%xmm5 \n" + : + : "m"(kABGRToU), // %0 + "m"(kABGRToV), // %1 + "m"(kAddUV128) // %2 + : +#if defined(__SSE2__) + "xmm3", "xmm4", "xmm5" +#endif + ); + asm volatile ( + "sub %1,%2 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "movdqu 0x20(%0),%%xmm2 \n" + "movdqu 0x30(%0),%%xmm6 \n" + "movdqu (%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqu 0x10(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm1 \n" + "movdqu 0x20(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqu 0x30(%0,%4,1),%%xmm7 \n" + "pavgb %%xmm7,%%xmm6 \n" + "lea 0x40(%0),%0 \n" + "movdqa %%xmm0,%%xmm7 \n" + "shufps $0x88,%%xmm1,%%xmm0 \n" + "shufps $0xdd,%%xmm1,%%xmm7 \n" + "pavgb %%xmm7,%%xmm0 \n" + "movdqa %%xmm2,%%xmm7 \n" + "shufps $0x88,%%xmm6,%%xmm2 \n" + "shufps $0xdd,%%xmm6,%%xmm7 \n" + "pavgb %%xmm7,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm2,%%xmm6 \n" + "pmaddubsw %%xmm4,%%xmm0 \n" + "pmaddubsw %%xmm4,%%xmm2 \n" + "pmaddubsw %%xmm3,%%xmm1 \n" + "pmaddubsw %%xmm3,%%xmm6 \n" + "phaddw %%xmm2,%%xmm0 \n" + "phaddw %%xmm6,%%xmm1 \n" + "psraw $0x8,%%xmm0 \n" + "psraw $0x8,%%xmm1 \n" + "packsswb %%xmm1,%%xmm0 \n" + "paddb %%xmm5,%%xmm0 \n" + "movlps %%xmm0,(%1) \n" + "movhps %%xmm0,(%1,%2,1) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "ja 1b \n" + : "+r"(src_abgr0), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+rm"(width) // %3 + : "r"(static_cast(src_stride_abgr)) + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7" +#endif + ); +} +#endif + +#endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_I420TOARGBROW_SSSE3 #define UB 127 /* min(63,static_cast(2.018 * 64)) */ @@ -1056,66 +1496,6 @@ void YToARGBRow_SSE2(const uint8* y_buf, } #endif -#ifdef HAS_ARGBTOYROW_SSSE3 -void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, 
int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - ABGRToARGBRow_SSSE3(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} - -void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - BGRAToARGBRow_SSSE3(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} - -void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - ABGRToARGBRow_C(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} - -void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride]); - BGRAToARGBRow_C(src_argb, row, pix); - ARGBToYRow_SSSE3(row, dst_y, pix); -} -#endif - -#ifdef HAS_ARGBTOUVROW_SSSE3 -void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - ABGRToARGBRow_SSSE3(src_argb, row, pix); - ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} - -void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - BGRAToARGBRow_SSSE3(src_argb, row, pix); - BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} - -void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - ABGRToARGBRow_C(src_argb, row, pix); - ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} - -void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int pix) { - SIMD_ALIGNED(uint8 row[kMaxStride * 2]); - BGRAToARGBRow_C(src_argb, row, pix); - BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix); - ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix); -} -#endif - #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. diff --git a/source/row_win.cc b/source/row_win.cc index a71bc7e4e..d6169a306 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -20,7 +20,7 @@ extern "C" { #ifdef HAS_ARGBTOYROW_SSSE3 -// Constant multiplication table for converting ARGB to I400. +// Constants for ARGB static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 };
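
For reference, a minimal usage sketch of the MJPGToI420() declaration added to include/libyuv/convert.h above (illustrative only, not code from this change). It assumes HAVE_JPEG is defined in the build, that the converter lives in the usual libyuv namespace and returns 0 on success like the other conversion functions, and that the caller already owns contiguous I420 planes; the function and buffer names below are hypothetical.

#include <cstddef>
#include "libyuv/convert.h"

// Decode one MJPEG frame, reported by the capture at src_width x src_height,
// into caller-owned I420 planes of the same size. Passing a smaller
// dst_width/dst_height would clip the decoded image to that size.
bool DecodeMjpegFrame(const uint8* mjpg_sample, size_t mjpg_size,
                      int src_width, int src_height,
                      uint8* dst_y, uint8* dst_u, uint8* dst_v) {
  const int dst_width = src_width;    // no clipping in this example
  const int dst_height = src_height;
  const int y_stride = dst_width;
  const int uv_stride = (dst_width + 1) / 2;  // U/V are half width in I420
  return libyuv::MJPGToI420(mjpg_sample, mjpg_size,
                            dst_y, y_stride,
                            dst_u, uv_stride,
                            dst_v, uv_stride,
                            src_width, src_height,
                            dst_width, dst_height) == 0;
}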
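
The new BGRAToYRow_SSSE3 / ABGRToYRow_SSSE3 blocks compute, per pixel, a fixed-point BT.601 luma value from kBGRAToY / kABGRToY and kAddY16: pmaddubsw and phaddw form 33*R + 65*G + 13*B, psrlw drops the 7 fraction bits, and paddb adds the 16 offset. A scalar model of that arithmetic follows (illustrative only, not code from this change); it assumes libyuv's BGRA pixels are stored as bytes A,R,G,B in memory, and for ABGR (bytes R,G,B,A) only the byte offsets change.

#include <cstdint>

// Scalar model of BGRAToYRow_SSSE3: Y = ((33*R + 65*G + 13*B) >> 7) + 16.
// 33/128, 65/128 and 13/128 approximate the BT.601 weights 0.257R + 0.504G
// + 0.098B; the result always lands in [16, 237], so no clamp is required.
void BGRAToYRow_Scalar(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    const int r = src_bgra[1];  // byte order assumed A,R,G,B
    const int g = src_bgra[2];
    const int b = src_bgra[3];
    dst_y[x] = static_cast<uint8_t>(((33 * r + 65 * g + 13 * b) >> 7) + 16);
    src_bgra += 4;
  }
}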
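
The matching BGRAToUVRow_SSSE3 / ABGRToUVRow_SSSE3 blocks first average each 2x2 block of pixels with pavgb (against the row at src_stride, then across column pairs), then apply kBGRAToU / kBGRAToV with pmaddubsw, shift with psraw $0x8 and add kAddUV128. A scalar model follows (again illustrative, under the same A,R,G,B byte-order assumption); it averages the four samples in one rounded step, whereas pavgb rounds each pairwise average, so the low bit of the result can differ.

#include <cstdint>

// Scalar model of BGRAToUVRow_SSSE3: one U and one V byte per 2x2 input
// block. Width is assumed even; ">> 8" stands in for psraw $0x8, i.e. an
// arithmetic shift of possibly negative sums. The biased results stay in
// [16, 239], so no clamp is required.
void BGRAToUVRow_Scalar(const uint8_t* src_bgra, int src_stride_bgra,
                        uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p0 = src_bgra + x * 4;      // top-left pixel, bytes A,R,G,B
    const uint8_t* p1 = p0 + src_stride_bgra;  // pixel one row below
    const int r = (p0[1] + p0[5] + p1[1] + p1[5] + 2) >> 2;
    const int g = (p0[2] + p0[6] + p1[2] + p1[6] + 2) >> 2;
    const int b = (p0[3] + p0[7] + p1[3] + p1[7] + 2) >> 2;
    dst_u[x / 2] = static_cast<uint8_t>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
    dst_v[x / 2] = static_cast<uint8_t>(((112 * r - 94 * g - 18 * b) >> 8) + 128);
  }
}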