diff --git a/README.chromium b/README.chromium index 98060c126..717f1f6cd 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 833 +Version: 834 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index b21267d83..8b220c800 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -1613,12 +1613,11 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, uint8* dst_argb, const float* poly, int width); -void ARGBLumaColorTableRow_C(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width); -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width); +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff); +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, + const uint32 lumacoeff); // Divide num by div and return as 16.16 fixed point result. int FixedDiv_C(int num, int div); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5b5e24f62..cff5733d4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 833 +#define LIBYUV_VERSION 834 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/cpu_id.cc b/source/cpu_id.cc index e223153d3..f472f4c9a 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -75,7 +75,7 @@ int TestOsSaveYmm() { mov xcr0, eax } #elif defined(__i386__) || defined(__x86_64__) - asm volatile (".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx" ); + asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcr0) : "c" (0) : "%edx"); #endif // defined(_MSC_VER) return((xcr0 & 6) == 6); // Is ymm saved? } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 802d9c39d..8bbeb52ac 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2074,16 +2074,16 @@ int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb, height = 1; src_stride_argb = dst_stride_argb = 0; } - void (*ARGBLumaColorTableRow)(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width) = ARGBLumaColorTableRow_C; + void (*ARGBLumaColorTableRow)(const uint8* src_argb, uint8* dst_argb, + int width, const uint8* luma, const uint32 lumacoeff) = + ARGBLumaColorTableRow_C; #if defined(HAS_ARGBLUMACOLORTABLEROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4)) { ARGBLumaColorTableRow = ARGBLumaColorTableRow_SSSE3; } #endif for (int y = 0; y < height; ++y) { - ARGBLumaColorTableRow(src_argb, dst_argb, luma, width); + ARGBLumaColorTableRow(src_argb, dst_argb, width, luma, 0x00264b0f); src_argb += src_stride_argb; dst_argb += dst_stride_argb; } diff --git a/source/row_common.cc b/source/row_common.cc index a12869bbd..2ab8d40bb 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2082,30 +2082,22 @@ void ARGBPolynomialRow_C(const uint8* src_argb, } } -// RGB to Luminance. -// Leverage the fact that we want shifted left by 8 by the caller. -// -// Borrowed from libyuv/files/source/row_common.cc. -// JPeg 7 bit Y: -// b 0.11400 * 128 = 14.592 = 15 -// g 0.58700 * 128 = 75.136 = 75 -// r 0.29900 * 128 = 38.272 = 38 -static __inline unsigned int RGBToYJx256(uint8 r, uint8 g, uint8 b) { - return (38u * r + 75u * g + 15u * b) & 0x7F00u; -} -void ARGBLumaColorTableRow_C(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width) { +void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width, + const uint8* luma, const uint32 lumacoeff) { + uint32 bc = lumacoeff & 0xff; + uint32 gc = (lumacoeff >> 8) & 0xff; + uint32 rc = (lumacoeff >> 16) & 0xff; + for (int i = 0; i < width - 1; i += 2) { // Luminance in rows, color values in columns. - const uint8* luma0 = RGBToYJx256(src_argb[2], src_argb[1], src_argb[0]) + - luma; + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; dst_argb[3] = src_argb[3]; - const uint8* luma1 = RGBToYJx256(src_argb[6], src_argb[5], src_argb[4]) + - luma; + const uint8* luma1 = ((src_argb[4] * bc + src_argb[5] * gc + + src_argb[6] * rc) & 0x7F00u) + luma; dst_argb[4] = luma1[src_argb[4]]; dst_argb[5] = luma1[src_argb[5]]; dst_argb[6] = luma1[src_argb[6]]; @@ -2115,8 +2107,8 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, } if (width & 1) { // Luminance in rows, color values in columns. - const uint8* luma0 = RGBToYJx256(src_argb[2], src_argb[1], src_argb[0]) + - luma; + const uint8* luma0 = ((src_argb[0] * bc + src_argb[1] * gc + + src_argb[2] * rc) & 0x7F00u) + luma; dst_argb[0] = luma0[src_argb[0]]; dst_argb[1] = luma0[src_argb[1]]; dst_argb[2] = luma0[src_argb[2]]; diff --git a/source/row_posix.cc b/source/row_posix.cc index 956512dc3..daa9853c7 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4477,14 +4477,10 @@ void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( - "movd " MEMACCESS(3) ",%%xmm2 \n" - "movd " MEMACCESS2(0x4,3) ",%%xmm3 \n" - "movd " MEMACCESS2(0x8,3) ",%%xmm4 \n" - "movd " MEMACCESS2(0xc,3) ",%%xmm5 \n" - "pshufd $0x0,%%xmm2,%%xmm2 \n" - "pshufd $0x0,%%xmm3,%%xmm3 \n" - "pshufd $0x0,%%xmm4,%%xmm4 \n" - "pshufd $0x0,%%xmm5,%%xmm5 \n" + "pshufd $0x00," MEMACCESS(3) ",%%xmm2 \n" + "pshufd $0x55," MEMACCESS(3) ",%%xmm3 \n" + "pshufd $0xaa," MEMACCESS(3) ",%%xmm4 \n" + "pshufd $0xff," MEMACCESS(3) ",%%xmm5 \n" // 8 pixel loop. ".p2align 4 \n" @@ -6361,17 +6357,15 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { } #endif // HAS_RGBCOLORTABLEROW_X86 -// TODO(fbarchard): Ensure this works with minimal number of registers/gcc32. #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width) { +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { uintptr_t pixel_temp = 0u; uintptr_t table_temp = 0u; asm volatile ( - "mov $0x264b0f,%%edx \n" - "movd %%edx,%%xmm3 \n" + "movd %6,%%xmm3 \n" "pshufd $0x0,%%xmm3,%%xmm3 \n" "pcmpeqb %%xmm4,%%xmm4 \n" "psllw $0x8,%%xmm4 \n" @@ -6456,7 +6450,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, "+r"(src_argb), // %2 "+r"(dst_argb), // %3 "+rm"(width) // %4 - : "r"(luma) // %5 + : "r"(luma), // %5 + "rm"(lumacoeff) // %6 : "memory", "cc" #if defined(__SSE2__) , "xmm0", "xmm3", "xmm4", "xmm5" diff --git a/source/row_win.cc b/source/row_win.cc index 4cb6201d2..bbb36bbc9 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5146,17 +5146,13 @@ void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb, mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* matrix_argb */ - movd xmm2, [ecx] - movd xmm3, [ecx + 4] - movd xmm4, [ecx + 8] - movd xmm5, [ecx + 12] - pshufd xmm2, xmm2, 0 - pshufd xmm3, xmm3, 0 - pshufd xmm4, xmm4, 0 - pshufd xmm5, xmm5, 0 + pshufd xmm2, [ecx], 0x00 + pshufd xmm3, [ecx], 0x55 + pshufd xmm4, [ecx], 0xaa + pshufd xmm5, [ecx], 0xff mov ecx, [esp + 16] /* width */ - align 16 + align 4 convertloop: movdqa xmm0, [eax] // B movdqa xmm7, [eax + 16] @@ -7142,21 +7138,20 @@ void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) { #ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3 // Tranform RGB pixels with luma table. __declspec(naked) __declspec(align(16)) -void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, - uint8* dst_argb, const uint8* luma, - int width) { +void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width, + const uint8* luma, uint32 lumacoeff) { __asm { push esi push edi mov eax, [esp + 8 + 4] /* src_argb */ mov edi, [esp + 8 + 8] /* dst_argb */ - movd xmm2, dword ptr [esp + 8 + 12] /* table_argb */ + mov ecx, [esp + 8 + 12] /* width */ + movd xmm2, dword ptr [esp + 8 + 16] // luma table + movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff pshufd xmm2, xmm2, 0 - mov ecx, [esp + 8 + 16] /* width */ - mov edx, 0x00264b0f // kARGBToYJ - movd xmm3, edx pshufd xmm3, xmm3, 0 - pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 + pcmpeqb xmm4, xmm4 // generate mask 0xff00ff00 psllw xmm4, 8 pxor xmm5, xmm5