From 228bdc24e44264baf3402124aaa6d4d81c8896f5 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 15 Nov 2011 21:58:26 +0000
Subject: [PATCH] port yuv to rgb ssse3 to gcc

BUG=none
TEST=media_unittest
Review URL: http://webrtc-codereview.appspot.com/269015

git-svn-id: http://libyuv.googlecode.com/svn/trunk@80 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 source/planar_functions.cc |  38 +--
 source/row.h               |  17 +-
 source/row_posix.cc        | 475 ++++++++++++++++++++++---------------
 source/row_win.cc          |  38 +--
 5 files changed, 299 insertions(+), 271 deletions(-)

diff --git a/README.chromium b/README.chromium
index 3c60491ee..40059043c 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 79
+Version: 80
 License: BSD
 License File: LICENSE
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index f84b4cacb..b63b8a7a4 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -1136,19 +1136,6 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 4 == 0) &&
-      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
-  } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1188,12 +1175,6 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
@@ -1233,12 +1214,6 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
@@ -1278,12 +1253,6 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      (width % 2 == 0)) {
-    FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
@@ -1321,11 +1290,6 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSSE3;
   } else
-#endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
-  } else
 #endif
   {
     FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
@@ -1354,7 +1318,7 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                                 uint8* rgb_buf,
                                 int width);
 #if defined(HAS_FASTCONVERTYTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSE2) &&
       (width % 8 == 0) &&
       IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
     FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
diff --git a/source/row.h b/source/row.h
index 92ed6a805..53cf8a67a 100644
--- a/source/row.h
+++ b/source/row.h
@@ -37,28 +37,17 @@
 #define HAS_BGRATOUVROW_SSSE3
 #define HAS_ABGRTOUVROW_SSSE3
 #define HAS_I400TOARGBROW_SSE2
-#endif
-
-// The following are available on Linux (32/64 bit)
-// TODO(fbarchard): enable for fpic on linux
-#if (defined(__x86_64__) || \
-    (defined(__i386__) && !defined(__pic__))) && \
-    !defined(LIBYUV_DISABLE_ASM)
-#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
-#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
-#define HAS_FASTCONVERTYUV444TOARGBROW_SSE2
+#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif

-// The following are available on Windows
-#if defined(WIN32) && \
+// The following are available on all x86 platforms except 32 bit OSX
+#if (defined(WIN32) || defined(__x86_64__) || \
+    (defined(__i386__) && !defined(__APPLE__))) && \
     !defined(LIBYUV_DISABLE_ASM)
 #define HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 #define HAS_FASTCONVERTYUVTOBGRAROW_SSSE3
 #define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
 #define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
-#define HAS_FASTCONVERTYTOARGBROW_SSE2
 #endif

 extern "C" {
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 8db175548..2eb5fc3af 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -14,49 +14,49 @@
 extern "C" {

-#ifdef HAS_ARGBTOYROW_SSSE3
-
-// Constant multiplication table for converting ARGB to I400.
-static const vec8 kARGBToY = {
-  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
-};
-
-static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
-};
-
 #ifdef HAS_ARGBTOUVROW_SSSE3
-static const vec8 kARGBToU = {
+vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };

-static const uvec8 kARGBToV = {
+uvec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
 };

-static const uvec8 kAddUV128 = {
+uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 #endif

+#ifdef HAS_ARGBTOYROW_SSSE3
+
+// Constant multiplication table for converting ARGB to I400.
+vec8 kARGBToY = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
+uvec8 kAddY16 = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
 // Shuffle table for converting BG24 to ARGB.
-static const uvec8 kShuffleMaskBG24ToARGB = {
+uvec8 kShuffleMaskBG24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
 };

 // Shuffle table for converting RAW to ARGB.
-static const uvec8 kShuffleMaskRAWToARGB = {
+uvec8 kShuffleMaskRAWToARGB = {
   2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
 };

 // Shuffle table for converting ABGR to ARGB.
-static const uvec8 kShuffleMaskABGRToARGB = {
+uvec8 kShuffleMaskABGRToARGB = {
   2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
 };

 // Shuffle table for converting BGRA to ARGB.
-static const uvec8 kShuffleMaskBGRAToARGB = {
+uvec8 kShuffleMaskBGRAToARGB = {
   3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
 };

@@ -145,17 +145,17 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea    0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por    %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por    %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por    %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por    %%xmm5,%%xmm3 \n"
@@ -185,17 +185,17 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
   "movdqa 0x20(%0),%%xmm3 \n"
   "lea    0x30(%0),%0 \n"
   "movdqa %%xmm3,%%xmm2 \n"
-  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
+  "palignr $0x8,%%xmm1,%%xmm2 \n"
   "pshufb %%xmm4,%%xmm2 \n"
   "por    %%xmm5,%%xmm2 \n"
-  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
+  "palignr $0xc,%%xmm0,%%xmm1 \n"
   "pshufb %%xmm4,%%xmm0 \n"
   "movdqa %%xmm2,0x20(%1) \n"
   "por    %%xmm5,%%xmm0 \n"
   "pshufb %%xmm4,%%xmm1 \n"
   "movdqa %%xmm0,(%1) \n"
   "por    %%xmm5,%%xmm1 \n"
-  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
+  "palignr $0x4,%%xmm3,%%xmm3 \n"
   "pshufb %%xmm4,%%xmm3 \n"
   "movdqa %%xmm1,0x10(%1) \n"
   "por    %%xmm5,%%xmm3 \n"
@@ -318,229 +318,320 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 #endif

-// The following code requires 6 registers and prefers 7 registers.
-// 7 registers requires -fpic to be off, and -fomit-frame-pointer
-#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
-#if defined(__x86_64__)
-#define REG_a "rax"
-#define REG_d "rdx"
-#else
-#define REG_a "eax"
-#define REG_d "edx"
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
+#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
+
+vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
 #endif
+
+#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
+#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
+#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
+#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+vec8 kUVToB = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+vec8 kUVToR = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+vec8 kUVToG = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+
+vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
 #if defined(__APPLE__) || defined(__x86_64__)
 #define OMITFP
 #else
 #define OMITFP __attribute__((optimize("omit-frame-pointer")))
 #endif
-#define CLOBBER "%"REG_a, "%"REG_d

-// This version produces 2 pixels
+// This version produces 8 pixels
 #define YUVTORGB \
-"1: \n" \
-  "movzb (%1),%%"REG_a" \n" \
-  "lea   1(%1),%1 \n" \
-  "movzb (%2),%%"REG_d" \n" \
-  "lea   1(%2),%2 \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "lea   2(%0),%0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm1 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm0,%%xmm1 \n" \
-  "psraw $6,%%xmm1 \n" \
-  "packuswb %%xmm1,%%xmm1 \n" \
-  "movq  %%xmm1,0(%3) \n" \
-  "lea   8(%3),%3 \n" \
-  "sub   $0x2,%4 \n" \
-  "ja    1b \n"
-// This version produces 4 pixels
-#define YUVTORGB4 \
-"1: \n" \
-  "movzb 0(%1),%%"REG_a" \n" \
-  "movzb 0(%2),%%"REG_d" \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 0(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 1(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm2 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
-  "paddsw %%xmm0,%%xmm2 \n" \
-  "psraw $6,%%xmm2 \n" \
-  "movzb 1(%1),%%"REG_a" \n" \
-  "movzb 1(%2),%%"REG_d" \n" \
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n" \
-  "movzb 2(%0),%%"REG_a" \n" \
-  "movq  4096(%5,%%"REG_d",8),%%xmm1 \n" \
-  "paddsw %%xmm1,%%xmm0 \n" \
-  "movzb 3(%0),%%"REG_d" \n" \
-  "punpcklqdq %%xmm0,%%xmm0 \n" \
-  "movq  0(%5,%%"REG_a",8),%%xmm3 \n" \
-  "movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
-  "paddsw %%xmm0,%%xmm3 \n" \
-  "psraw $6,%%xmm3 \n" \
-  "lea   2(%1),%1 \n" \
-  "lea   2(%2),%2 \n" \
-  "lea   4(%0),%0 \n" \
-  "packuswb %%xmm3,%%xmm2 \n" \
-  "movdqa %%xmm2,0(%3) \n" \
-  "lea   16(%3),%3 \n" \
-  "sub   $0x4,%4 \n" \
-  "ja    1b \n" \
+  "movd (%1),%%xmm0 \n" \
+  "movd (%1,%2,1),%%xmm1 \n" \
+  "lea  0x4(%1),%1 \n" \
+  "punpcklbw %%xmm1,%%xmm0 \n" \
+  "punpcklwd %%xmm0,%%xmm0 \n" \
+  "movdqa %%xmm0,%%xmm1 \n" \
+  "movdqa %%xmm0,%%xmm2 \n" \
+  "pmaddubsw %5,%%xmm0 \n" \
+  "pmaddubsw %6,%%xmm1 \n" \
+  "pmaddubsw %7,%%xmm2 \n" \
+  "psubw %8,%%xmm0 \n" \
+  "psubw %9,%%xmm1 \n" \
+  "psubw %10,%%xmm2 \n" \
+  "movq (%0),%%xmm3 \n" \
+  "lea  0x8(%0),%0 \n" \
+  "punpcklbw %%xmm4,%%xmm3 \n" \
+  "psubsw %11,%%xmm3 \n" \
+  "pmullw %12,%%xmm3 \n" \
+  "paddw %%xmm3,%%xmm0 \n" \
+  "paddw %%xmm3,%%xmm1 \n" \
+  "paddw %%xmm3,%%xmm2 \n" \
+  "psraw $0x6,%%xmm0 \n" \
+  "psraw $0x6,%%xmm1 \n" \
+  "psraw $0x6,%%xmm2 \n" \
+  "packuswb %%xmm0,%%xmm0 \n" \
+  "packuswb %%xmm1,%%xmm1 \n" \
+  "packuswb %%xmm2,%%xmm2 \n"

-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
-  asm volatile (
-    YUVTORGB
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", CLOBBER
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-);
-}
-
-// 6 or 7 registers
-void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,  // rdi
+void OMITFP FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,  // rdi
                                           const uint8* u_buf,  // rsi
                                           const uint8* v_buf,  // rdx
                                           uint8* rgb_buf,      // rcx
                                           int width) {         // r8
   asm volatile (
-    YUVTORGB4
-  : "+r"(y_buf),    // %0
-    "+r"(u_buf),    // %1
-    "+r"(v_buf),    // %2
-    "+r"(rgb_buf),  // %3
-    "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", CLOBBER
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
-#endif
-);
-}
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"

-void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
-  asm volatile (
+  "1: \n"
     YUVTORGB
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklwd %%xmm2,%%xmm0 \n"
+    "movdqa %%xmm0,(%3) \n"
+    "punpckhwd %%xmm2,%%xmm1 \n"
+    "movdqa %%xmm1,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsBgraY)  // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,  // rdi
-                                         const uint8* u_buf,  // rsi
-                                         const uint8* v_buf,  // rdx
-                                         uint8* rgb_buf,      // rcx
-                                         int width) {         // r8
+void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,  // rdi
+                                          const uint8* u_buf,  // rsi
+                                          const uint8* v_buf,  // rdx
+                                          uint8* rgb_buf,      // rcx
+                                          int width) {         // r8
   asm volatile (
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
     YUVTORGB
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "punpcklbw %%xmm0,%%xmm1 \n"
+    "punpcklbw %%xmm2,%%xmm5 \n"
+    "movdqa %%xmm5,%%xmm0 \n"
+    "punpcklwd %%xmm1,%%xmm5 \n"
+    "movdqa %%xmm5,(%3) \n"
+    "punpckhwd %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsAbgrY)  // %5
-  : "memory", "cc", CLOBBER
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-// 6 registers
-void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,  // rdi
-                                            const uint8* u_buf,  // rsi
-                                            const uint8* v_buf,  // rdx
-                                            uint8* rgb_buf,      // rcx
-                                            int width) {         // r8
+void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,  // rdi
+                                          const uint8* u_buf,  // rsi
+                                          const uint8* v_buf,  // rdx
+                                          uint8* rgb_buf,      // rcx
+                                          int width) {         // r8
   asm volatile (
-"1: \n"
-  "movzb (%1),%%"REG_a" \n"
-  "lea   1(%1),%1 \n"
-  "movq  2048(%5,%%"REG_a",8),%%xmm0 \n"
-  "movzb (%2),%%"REG_a" \n"
-  "lea   1(%2),%2 \n"
-  "movq  4096(%5,%%"REG_a",8),%%xmm1 \n"
-  "paddsw %%xmm1,%%xmm0 \n"
-  "movzb (%0),%%"REG_a" \n"
-  "lea   1(%0),%0 \n"
-  "movq  0(%5,%%"REG_a",8),%%xmm2 \n"
-  "paddsw %%xmm0,%%xmm2 \n"
-  "shufps $0x44,%%xmm2,%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movd  %%xmm2,0x0(%3) \n"
-  "lea   4(%3),%3 \n"
-  "sub   $0x1,%4 \n"
-  "ja    1b \n"
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
+    YUVTORGB
+    "packuswb %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm1,%%xmm1 \n"
+    "packuswb %%xmm2,%%xmm2 \n"
+    "punpcklbw %%xmm1,%%xmm2 \n"
+    "punpcklbw %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm2,%%xmm1 \n"
+    "punpcklwd %%xmm0,%%xmm2 \n"
+    "movdqa %%xmm2,(%3) \n"
+    "punpckhwd %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm1,0x10(%3) \n"
+    "lea  0x20(%3),%3 \n"
+    "sub  $0x8,%4 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(u_buf),    // %1
     "+r"(v_buf),    // %2
     "+r"(rgb_buf),  // %3
     "+rm"(width)    // %4
-  : "r" (kCoefficientsRgbY)  // %5
-  : "memory", "cc", "%"REG_a
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }

-// 5 registers
+void OMITFP FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,  // rdi
+                                             const uint8* u_buf,  // rsi
+                                             const uint8* v_buf,  // rdx
+                                             uint8* rgb_buf,      // rcx
+                                             int width) {         // r8
+  asm volatile (
+    "sub %1,%2 \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+
+  "1: \n"
+    "movd (%1),%%xmm0 \n"
+    "movd (%1,%2,1),%%xmm1 \n"
+    "lea  0x4(%1),%1 \n"
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "movdqa %%xmm0,%%xmm2 \n"
+    "pmaddubsw %5,%%xmm0 \n"
+    "pmaddubsw %6,%%xmm1 \n"
+    "pmaddubsw %7,%%xmm2 \n"
+    "psubw %8,%%xmm0 \n"
+    "psubw %9,%%xmm1 \n"
+    "psubw %10,%%xmm2 \n"
+    "movd (%0),%%xmm3 \n"
+    "lea  0x4(%0),%0 \n"
+    "punpcklbw %%xmm4,%%xmm3 \n"
+    "psubsw %11,%%xmm3 \n"
+    "pmullw %12,%%xmm3 \n"
+    "paddw %%xmm3,%%xmm0 \n"
+    "paddw %%xmm3,%%xmm1 \n"
+    "paddw %%xmm3,%%xmm2 \n"
+    "psraw $0x6,%%xmm0 \n"
+    "psraw $0x6,%%xmm1 \n"
+    "psraw $0x6,%%xmm2 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "packuswb %%xmm1,%%xmm1 \n"
+    "packuswb %%xmm2,%%xmm2 \n"
+    "punpcklbw %%xmm1,%%xmm0 \n"
+    "punpcklbw %%xmm5,%%xmm2 \n"
+    "punpcklwd %%xmm2,%%xmm0 \n"
+    "movdqa %%xmm0,(%3) \n"
+    "lea  0x10(%3),%3 \n"
+    "sub  $0x4,%4 \n"
+    "ja   1b \n"
+  : "+r"(y_buf),    // %0
+    "+r"(u_buf),    // %1
+    "+r"(v_buf),    // %2
+    "+r"(rgb_buf),  // %3
+    "+rm"(width)    // %4
+  : "m" (kUVToB),   // %5
+    "m" (kUVToG),   // %6
+    "m" (kUVToR),   // %7
+    "m" (kUVBiasB), // %8
+    "m" (kUVBiasG), // %9
+    "m" (kUVBiasR), // %10
+    "m" (kYSub16),  // %11
+    "m" (kYToRgb)   // %12
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
+  );
+}
+#endif
+
+#ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
   asm volatile (
-"1: \n"
-  "movzb (%0),%%"REG_a" \n"
-  "movzb 0x1(%0),%%"REG_d" \n"
-  "movq  (%3,%%"REG_a",8),%%xmm2 \n"
-  "lea   2(%0),%0 \n"
-  "movhps (%3,%%"REG_d",8),%%xmm2 \n"
-  "psraw $0x6,%%xmm2 \n"
-  "packuswb %%xmm2,%%xmm2 \n"
-  "movq  %%xmm2,0x0(%1) \n"
-  "lea   8(%1),%1 \n"
-  "sub   $0x2,%2 \n"
-  "ja    1b \n"
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "pslld $0x18,%%xmm5 \n"
+    "pxor %%xmm4,%%xmm4 \n"
+    "movdqa %3,%%xmm3 \n"
+    "movdqa %4,%%xmm2 \n"
+
+  "1: \n"
+    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
+    "movq (%0),%%xmm0 \n"
+    "lea  0x8(%0),%0 \n"
+    "punpcklbw %%xmm4,%%xmm0 \n"
+    "psubsw %%xmm3,%%xmm0 \n"
+    "pmullw %%xmm2,%%xmm0 \n"
+    "psraw $0x6,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+
+    // Step 2: Weave into ARGB
+    "punpcklbw %%xmm0,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "punpcklwd %%xmm0,%%xmm0 \n"
+    "por  %%xmm5,%%xmm0 \n"
+    "movdqa %%xmm0,(%1) \n"
+    "punpckhwd %%xmm1,%%xmm1 \n"
+    "por  %%xmm5,%%xmm1 \n"
+    "movdqa %%xmm1,16(%1) \n"
+    "lea  32(%1),%1 \n"
+
+    "sub  $0x8,%2 \n"
+    "ja   1b \n"
   : "+r"(y_buf),    // %0
     "+r"(rgb_buf),  // %1
     "+rm"(width)    // %2
-  : "r" (kCoefficientsRgbY)  // %3
-  : "memory", "cc", "%"REG_a, "%"REG_d
+  : "m" (kYSub16),  // %3
+    "m" (kYToRgb)   // %4
+  : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2"
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
-);
+  );
 }
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index bd8c33cce..12716fb03 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -54,8 +54,7 @@ static const vec8 kABGRToV = {
 };

 static const uvec8 kAddY16 = {
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
-  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };

 static const uvec8 kAddUV128 = {
@@ -548,27 +547,13 @@ static const vec8 kUVToG = {
   UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
 };

-static const vec16 kYToRgb = {
-  YG, YG, YG, YG, YG, YG, YG, YG
-};
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };

-static const vec16 kYSub16 = {
-  16, 16, 16, 16, 16, 16, 16, 16
-};
-
-static const vec16 kUVBiasB = {
-  BB, BB, BB, BB, BB, BB, BB, BB
-};
-
-static const vec16 kUVBiasG = {
-  BG, BG, BG, BG, BG, BG, BG, BG
-};
-
-static const vec16 kUVBiasR = {
-  BR, BR, BR, BR, BR, BR, BR, BR
-};
-
-#define YUVTORGB_SSSE3 __asm { \
+#define YUVTORGB __asm { \
     /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \
     __asm movd       xmm0, [esi]          /* U */ \
     __asm movd       xmm1, [esi + edi]    /* V */ \
@@ -619,7 +604,7 @@ void FastConvertYUVToARGBRow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into ARGB
     punpcklbw  xmm0, xmm1           // BG
@@ -658,7 +643,7 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into BGRA
     pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
@@ -699,7 +684,7 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
     pxor       xmm4, xmm4

  convertloop:
-    YUVTORGB_SSSE3
+    YUVTORGB

     // Step 3: Weave into ARGB
     punpcklbw  xmm2, xmm1           // RG
@@ -787,7 +772,6 @@ void FastConvertYUV444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif

 #ifdef HAS_FASTCONVERTYTOARGBROW_SSE2
-
 __declspec(naked)
 void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
                                 uint8* rgb_buf,
@@ -829,8 +813,8 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
     ret
   }
 }
-
 #endif
+
 #endif

 }  // extern "C"
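For reference, the SSSE3 and SSE2 row functions ported above implement fixed-point BT.601 YUV-to-RGB math with coefficients scaled by 64 (YG = 74, UB = 127, UG = -25, VG = -52, VR = 102, plus the kUVBias* terms). The scalar sketch below is illustrative only and is not part of the patch; the helper names are made up, but the arithmetic mirrors what the pmaddubsw/pmullw/psraw/packuswb sequence computes for each pixel.

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar equivalent of the fixed-point math in the SSSE3 row
    // functions (coefficients scaled by 64). packuswb's unsigned saturation is
    // modeled by Clamp8.
    static inline uint8_t Clamp8(int v) {
      return static_cast<uint8_t>(std::min(std::max(v, 0), 255));
    }

    // Hypothetical helper, not a libyuv API: converts one YUV pixel to ARGB.
    static void YuvPixelToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t argb[4]) {
      const int y1 = (static_cast<int>(y) - 16) * 74;             // kYSub16, kYToRgb
      const int b = (y1 + 127 * (u - 128)) >> 6;                   // kUVToB - kUVBiasB
      const int g = (y1 - 25 * (u - 128) - 52 * (v - 128)) >> 6;   // kUVToG - kUVBiasG
      const int r = (y1 + 102 * (v - 128)) >> 6;                   // kUVToR - kUVBiasR
      argb[0] = Clamp8(b);  // B
      argb[1] = Clamp8(g);  // G
      argb[2] = Clamp8(r);  // R
      argb[3] = 255;        // A
    }

The Y-only path (FastConvertYToARGBRow_SSE2) is the same Y term with the U and V contributions dropped, R = G = B = clamp(((y - 16) * 74) >> 6), with alpha forced to 0xFF by the por against the 0xFF000000 mask.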