From 7a0d01ef8ba25bdad7df1f27d8b0969f0e0a9185 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 19 Sep 2013 17:55:54 +0000 Subject: [PATCH] Luma Table optimized for SSSE3 BUG=267 TESTED=lUMA unittest R=jingning@google.com, nfullagar@google.com Review URL: https://webrtc-codereview.appspot.com/2257004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@793 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 3 +- include/libyuv/version.h | 2 +- source/planar_functions.cc | 2 +- source/row_win.cc | 81 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 85 insertions(+), 5 deletions(-) diff --git a/README.chromium b/README.chromium index 1647a093b..92fb87c4b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 792 +Version: 793 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 5ab299a73..9a2768f7e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -146,8 +146,7 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Effects: -// SSSE3 version incomplete: -// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Caveat: Visual C 2012 required for AVX2. 
#if _MSC_VER >= 1700
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 672514940..3ce716bec 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 792
+#define LIBYUV_VERSION 793
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index e4a255194..0fef75159 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2090,7 +2090,8 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
                                 uint8* dst_argb, const uint8* luma,
                                 int width) = ARGBLumaColorTableRow_C;
 #if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 2)) {
+  // The row function uses SSSE3 instructions and handles 4 pixels per step.
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) {
     ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3;
   }
 #endif
diff --git a/source/row_win.cc b/source/row_win.cc
index 0e1af3a88..677dd1c2e 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -5092,6 +5092,20 @@ void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
 #endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
 
 #ifdef HAS_ARGBCOLORTABLEROW_X86
+
+static uvec8 kMaskB = {
+  255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0,
+};
+static uvec8 kMaskG = {
+  0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0,
+};
+static uvec8 kMaskR = {
+  0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0,
+};
+static uvec8 kMaskA = {
+  0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255,
+};
+
 // Tranform ARGB pixels with color table.
 __declspec(naked) __declspec(align(16))
 void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
@@ -6844,6 +6858,92 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
 }
 #endif  // HAS_ARGBPOLYNOMIALROW_AVX2
 
+
+// RGB to Luminance.
+// The luminance is kept shifted left by 8 (low byte cleared) because the
+// caller uses it as a byte offset to a 256 byte row of the luma table.
+//
+// Borrowed from libyuv/files/source/row_common.cc.
+// JPeg 7 bit Y:
+// b 0.11400 * 128 = 14.592 = 15
+// g 0.58700 * 128 = 75.136 = 75
+// r 0.29900 * 128 = 38.272 = 38
+
+// Convert 4 ARGB pixels (16 bytes) to 4 luma table row pointers.
+//
+// lut[i] = luma + (sum(i) & 0xff00), where sum(i) is the pmaddubsw/phaddw
+// dot product of the 4 bytes of pixel i with kARGBToYJ (defined elsewhere in
+// this file; presumably the 7 bit weights listed above).
+//
+// src_argb may be unaligned (movdqu load).
+// lut must be 16 byte aligned (movdqa store) and hold 4 pointers; the paddd
+// on 4 lanes assumes 4 byte pointers, as in the _M_IX86 builds guarded below.
+// Requires SSSE3 (pmaddubsw, phaddw).
+__declspec(naked) __declspec(align(16))
+void ARGBToYJx4_SSSE3(const uint8* src_argb, const uint8* luma, uint8** lut) {
+  __asm {
+    mov        eax, [esp + 4]  /* src_argb */
+    movdqu     xmm0, [eax]  // 4 ARGB pixels, unaligned load.
+    pmaddubsw  xmm0, kARGBToYJ  // 2 partial sums per pixel.
+    movd       xmm1, [esp + 8]  /* luma */
+    mov        edx, [esp + 12]  /* lut */
+    phaddw     xmm0, xmm0  // 1 weighted sum per pixel.
+    pshufd     xmm1, xmm1, 0  // luma pointer in all 4 lanes.
+    pxor       xmm2, xmm2
+    psrlw      xmm0, 8
+    psllw      xmm0, 8  // 0y0y0y0y
+    punpcklwd  xmm0, xmm2  // 000y000y000y000y
+    paddd      xmm0, xmm1  // lum0lum1lum2lum3
+    movdqa     [edx], xmm0  // store the 4 row pointers.
+    ret
+  }
+}
+
+// Apply a luminance dependent color table to a row of ARGB pixels.
+//
+// The first three bytes of each pixel are replaced by lookups in the 256
+// byte table row selected by that pixel's luminance; the fourth byte is
+// copied unchanged.  Pixels are handled 4 at a time, so a remainder of
+// 1 to 3 pixels is not written; callers pass a width that is a multiple of 4.
+void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
+                                 uint8* dst_argb, const uint8* luma,
+                                 int width) {
+  SIMD_ALIGNED(uint8* lut4[4]);
+  for (int i = 0; i < width - 3; i += 4) {
+    // Row pointers depend on the 4 pixels about to be converted, so they
+    // are recomputed for every group.
+    ARGBToYJx4_SSSE3(src_argb, luma, lut4);
+
+    // Luminance in rows, color values in columns.
+    const uint8* luma0 = lut4[0];
+    dst_argb[0] = luma0[src_argb[0]];
+    dst_argb[1] = luma0[src_argb[1]];
+    dst_argb[2] = luma0[src_argb[2]];
+    dst_argb[3] = src_argb[3];
+
+    luma0 = lut4[1];
+    dst_argb[4] = luma0[src_argb[4]];
+    dst_argb[5] = luma0[src_argb[5]];
+    dst_argb[6] = luma0[src_argb[6]];
+    dst_argb[7] = src_argb[7];
+
+    luma0 = lut4[2];
+    dst_argb[8] = luma0[src_argb[8]];
+    dst_argb[9] = luma0[src_argb[9]];
+    dst_argb[10] = luma0[src_argb[10]];
+    dst_argb[11] = src_argb[11];
+
+    luma0 = lut4[3];
+    dst_argb[12] = luma0[src_argb[12]];
+    dst_argb[13] = luma0[src_argb[13]];
+    dst_argb[14] = luma0[src_argb[14]];
+    dst_argb[15] = src_argb[15];
+
+    src_argb += 16;
+    dst_argb += 16;
+  }
+}
+
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 #ifdef __cplusplus