From 6f7e514caa3e1c57ab1fd765151c52b9156113be Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Mon, 28 Oct 2013 17:10:49 +0000 Subject: [PATCH] Full metal BCS BUG=none TEST=Luma* unittest R=thorcarpenter@google.com Review URL: https://webrtc-codereview.appspot.com/3029004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@828 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/row.h | 3 +- include/libyuv/version.h | 2 +- source/planar_functions.cc | 18 ++- source/row_common.cc | 2 +- source/row_posix.cc | 100 ++++++++++++- source/row_win.cc | 278 ++++++++++++++++++++----------------- 7 files changed, 268 insertions(+), 137 deletions(-) diff --git a/README.chromium b/README.chromium index d9251e83f..09fe88852 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 827 +Version: 828 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d75f21ad3..b21267d83 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -144,6 +144,7 @@ extern "C" { // Effects: #define HAS_ARGBCOLORTABLEROW_X86 #define HAS_RGBCOLORTABLEROW_X86 +#define HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // The following are available on all x86 platforms, including NaCL, but @@ -173,8 +174,6 @@ extern "C" { // TODO(fbarchard): Port to gcc. #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Effects: -// TODO(fbarchard): Optimize and enable -// #define HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Caveat: Visual C 2012 required for AVX2. 
#if _MSC_VER >= 1700 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 309acac93..e3b9f1ec2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 827 +#define LIBYUV_VERSION 828 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 6880de2e2..802d9c39d 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1848,7 +1848,7 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb, } // ARGBToBayer used to select G channel from ARGB. void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, - uint32 selector, int pix) = ARGBToBayerRow_C; + uint32 selector, int pix) = ARGBToBayerGGRow_C; #if defined(HAS_ARGBTOBAYERGGROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && width >= 8 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { @@ -2014,9 +2014,15 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, const float* poly, int width, int height) { - if (!src_argb || !dst_argb || !poly || width <= 0 || height <= 0) { + if (!src_argb || !dst_argb || !poly || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { @@ -2052,9 +2058,15 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, const uint8* luma, int width, int height) { - if (!src_argb || !dst_argb || !luma || width <= 0 || height <= 0) { + if (!src_argb || !dst_argb || !luma || width <= 0 || height == 0) { return -1; } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } // Coalesce rows. if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) { diff --git a/source/row_common.cc b/source/row_common.cc index 4f3d937a8..a12869bbd 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1864,7 +1864,7 @@ void ARGBToBayerRow_C(const uint8* src_argb, // Select G channel from ARGB. e.g. GGGGGGGG void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { - // Copy a row of Bayer. + // Copy a row of G. for (int x = 0; x < pix - 1; x += 2) { dst_bayer[0] = src_argb[1]; dst_bayer[1] = src_argb[5]; diff --git a/source/row_posix.cc b/source/row_posix.cc index 823500f74..e30656173 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -6336,7 +6336,7 @@ void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, #ifdef HAS_RGBCOLORTABLEROW_X86 // Tranform RGB pixels with color table. void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { - uintptr_t pixel_temp = 0u; + uintptr_t pixel_temp = 0u; asm volatile ( // 1 pixel loop. ".p2align 4 \n" @@ -6361,6 +6361,104 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { } #endif // HAS_RGBCOLORTABLEROW_X86 +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Transform RGB pixels with luma table; alpha is copied unchanged. +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, + uint8* dst_argb, const uint8* luma, + int width) { + uintptr_t pixel_temp = 0u; + uintptr_t table_temp = 0u; + asm volatile ( + "movdqa %6,%%xmm3 \n" + "pcmpeqb %%xmm4,%%xmm4 \n" + "psllw $0x8,%%xmm4 \n" + "pxor %%xmm5,%%xmm5 \n" + + // 4 pixel loop.
+ ".p2align 4 \n" + "1: \n" + "movdqu (%2),%%xmm0 \n" // load 4 pixels (16 bytes) + "pmaddubsw %%xmm3,%%xmm0 \n" + "phaddw %%xmm0,%%xmm0 \n" + "pand %%xmm4,%%xmm0 \n" + "punpcklwd %%xmm5,%%xmm0 \n" + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb (%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,(%3) \n" + "movzb 0x1(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x1(%3) \n" + "movzb 0x2(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x2(%3) \n" + "movzb 0x3(%2),%0 \n" + "mov %b0,0x3(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x4(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x4(%3) \n" + "movzb 0x5(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x5(%3) \n" + "movzb 0x6(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x6(%3) \n" + "movzb 0x7(%2),%0 \n" + "mov %b0,0x7(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + "pshufd $0x39,%%xmm0,%%xmm0 \n" + + "movzb 0x8(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x8(%3) \n" + "movzb 0x9(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0x9(%3) \n" + "movzb 0xa(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0xa(%3) \n" + "movzb 0xb(%2),%0 \n" + "mov %b0,0xb(%3) \n" + + "movd %%xmm0,%k1 \n" // 32 bit offset + "add %5,%1 \n" + + "movzb 0xc(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0xc(%3) \n" + "movzb 0xd(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0xd(%3) \n" + "movzb 0xe(%2),%0 \n" + "movzb (%1,%0,1),%0 \n" + "mov %b0,0xe(%3) \n" + "movzb 0xf(%2),%0 \n" + "mov %b0,0xf(%3) \n" + "sub $0x4,%4 \n" + "lea 0x10(%2),%2 \n" + "lea 0x10(%3),%3 \n" + "jg 1b \n" + : "+d"(pixel_temp), // %0 + "+a"(table_temp), // %1 + "+r"(src_argb), // %2 + "+r"(dst_argb), // %3 + "+rm"(width) // %4 + : "rm"(luma), // %5 + "m"(kARGBToYJ) // %6 + : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc
b/source/row_win.cc index 1540a7faf..dfa0f6e93 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5202,83 +5202,6 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBCOLORMATRIXROW_SSSE3 -#ifdef HAS_ARGBCOLORTABLEROW_X86 - -static uvec8 kMaskB = { - 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, -}; -static uvec8 kMaskG = { - 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, -}; -static uvec8 kMaskR = { - 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, -}; -static uvec8 kMaskA = { - 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, 0, 0, 0, 255, -}; - -// Tranform ARGB pixels with color table. -__declspec(naked) __declspec(align(16)) -void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, - int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - movzx edx, byte ptr [eax - 4 + 3] - movzx edx, byte ptr [esi + edx * 4 + 3] - mov byte ptr [eax - 4 + 3], dl - dec ecx - jg convertloop - pop esi - ret - } -} -#endif // HAS_ARGBCOLORTABLEROW_X86 - -#ifdef HAS_RGBCOLORTABLEROW_X86 -// Tranform RGB pixels with color table. 
-__declspec(naked) __declspec(align(16)) -void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { - __asm { - push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ - mov ecx, [esp + 4 + 12] /* width */ - - convertloop: - movzx edx, byte ptr [eax] - lea eax, [eax + 4] - movzx edx, byte ptr [esi + edx * 4] - mov byte ptr [eax - 4], dl - movzx edx, byte ptr [eax - 4 + 1] - movzx edx, byte ptr [esi + edx * 4 + 1] - mov byte ptr [eax - 4 + 1], dl - movzx edx, byte ptr [eax - 4 + 2] - movzx edx, byte ptr [esi + edx * 4 + 2] - mov byte ptr [eax - 4 + 2], dl - dec ecx - jg convertloop - - pop esi - ret - } -} -#endif // HAS_RGBCOLORTABLEROW_X86 - #ifdef HAS_ARGBQUANTIZEROW_SSE2 // Quantize 4 ARGB pixels (16 bytes). // Aligned to 16 bytes. @@ -7149,72 +7072,171 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, } #endif // HAS_ARGBPOLYNOMIALROW_AVX2 - -// RGB to Luminance. -// Leverage the fact that we want shifted left by 8 by the caller. -// -// Borrowed from libyuv/files/source/row_common.cc. -// JPeg 7 bit Y: -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// r 0.29900 * 128 = 38.272 = 38 - -// Convert 16 ARGB pixels (64 bytes) to 16 Y values. +#ifdef HAS_ARGBCOLORTABLEROW_X86 +// Transform ARGB pixels with color table.
__declspec(naked) __declspec(align(16)) -void ARGBToYJx4_SSSE3(const uint8* src_argb, const uint8* luma, uint8** lut) { +void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, + int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - movdqa xmm0, [eax] - pmaddubsw xmm0, kARGBToYJ - movd xmm1, [esp + 8] /* luma */ - mov edx, [esp + 12] /* lut */ - phaddw xmm0, xmm0 - pshufd xmm1, xmm1, 0 - pxor xmm2, xmm2 - psrlw xmm0, 8 - psllw xmm0, 8 // 0y0y0y0y - punpcklwd xmm0, xmm2 // 000y000y000y000y - paddd xmm0, xmm1 // lum0lum1lum2lum3 - movdqa [edx], xmm0 + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop. + align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + movzx edx, byte ptr [eax - 4 + 3] + movzx edx, byte ptr [esi + edx * 4 + 3] + mov byte ptr [eax - 4 + 3], dl + dec ecx + jg convertloop + pop esi ret } } +#endif // HAS_ARGBCOLORTABLEROW_X86 +#ifdef HAS_RGBCOLORTABLEROW_X86 +// Transform RGB pixels with color table. +__declspec(naked) __declspec(align(16)) +void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { + __asm { + push esi + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ + mov ecx, [esp + 4 + 12] /* width */ + + // 1 pixel loop.
+ align 4 + convertloop: + movzx edx, byte ptr [eax] + lea eax, [eax + 4] + movzx edx, byte ptr [esi + edx * 4] + mov byte ptr [eax - 4], dl + movzx edx, byte ptr [eax - 4 + 1] + movzx edx, byte ptr [esi + edx * 4 + 1] + mov byte ptr [eax - 4 + 1], dl + movzx edx, byte ptr [eax - 4 + 2] + movzx edx, byte ptr [esi + edx * 4 + 2] + mov byte ptr [eax - 4 + 2], dl + dec ecx + jg convertloop + + pop esi + ret + } +} +#endif // HAS_RGBCOLORTABLEROW_X86 + +#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 +// Transform RGB pixels with luma table; alpha is copied unchanged. +__declspec(naked) __declspec(align(16)) void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const uint8* luma, int width) { - SIMD_ALIGNED(uint8* lut4[4]); - for (int i = 0; i < width - 3; i += 4) { - ARGBToYJx4_SSSE3(src_argb, luma, lut4); - // Luminance in rows, color values in columns. - const uint8* luma0 = lut4[0]; - dst_argb[0] = luma0[src_argb[0]]; - dst_argb[1] = luma0[src_argb[1]]; - dst_argb[2] = luma0[src_argb[2]]; - dst_argb[3] = src_argb[3]; + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ + movd xmm2, dword ptr [esp + 8 + 12] /* luma */ + pshufd xmm2, xmm2, 0 + mov ecx, [esp + 8 + 16] /* width */ + movdqa xmm3, kARGBToYJ + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + psllw xmm4, 8 + pxor xmm5, xmm5 - luma0 = lut4[1]; - dst_argb[4] = luma0[src_argb[4]]; - dst_argb[5] = luma0[src_argb[5]]; - dst_argb[6] = luma0[src_argb[6]]; - dst_argb[7] = src_argb[7]; + // 4 pixel loop.
+ align 4 + convertloop: + movdqu xmm0, [eax] // load 4 pixels; generate luma ptrs + pmaddubsw xmm0, xmm3 + phaddw xmm0, xmm0 + pand xmm0, xmm4 // mask out low bits + punpcklwd xmm0, xmm5 + paddd xmm0, xmm2 // add table base + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - luma0 = lut4[2]; - dst_argb[8] = luma0[src_argb[8]]; - dst_argb[9] = luma0[src_argb[9]]; - dst_argb[10] = luma0[src_argb[10]]; - dst_argb[11] = src_argb[11]; + movzx edx, byte ptr [eax] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi], dl + movzx edx, byte ptr [eax + 1] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 1], dl + movzx edx, byte ptr [eax + 2] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 2], dl + movzx edx, byte ptr [eax + 3] // copy alpha. + mov byte ptr [edi + 3], dl - luma0 = lut4[3]; - dst_argb[12] = luma0[src_argb[12]]; - dst_argb[13] = luma0[src_argb[13]]; - dst_argb[14] = luma0[src_argb[14]]; - dst_argb[15] = src_argb[15]; + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 - src_argb += 16; - dst_argb += 16; + movzx edx, byte ptr [eax + 4] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 4], dl + movzx edx, byte ptr [eax + 5] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 5], dl + movzx edx, byte ptr [eax + 6] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 6], dl + movzx edx, byte ptr [eax + 7] // copy alpha. + mov byte ptr [edi + 7], dl + + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // 00111001 to rotate right 32 + + movzx edx, byte ptr [eax + 8] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 8], dl + movzx edx, byte ptr [eax + 9] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 9], dl + movzx edx, byte ptr [eax + 10] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 10], dl + movzx edx, byte ptr [eax + 11] // copy alpha.
+ mov byte ptr [edi + 11], dl + + movd esi, xmm0 + + movzx edx, byte ptr [eax + 12] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 12], dl + movzx edx, byte ptr [eax + 13] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 13], dl + movzx edx, byte ptr [eax + 14] + movzx edx, byte ptr [esi + edx] + mov byte ptr [edi + 14], dl + movzx edx, byte ptr [eax + 15] // copy alpha. + mov byte ptr [edi + 15], dl + + sub ecx, 4 + lea eax, [eax + 16] + lea edi, [edi + 16] + jg convertloop + + pop edi + pop esi + ret } } +#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)