diff --git a/README.chromium b/README.chromium index 9f70d6007..0d7b6f2b0 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 785 +Version: 786 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fa9ecda74..3e59d159f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -98,6 +98,7 @@ extern "C" { // Conversions: #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 +#define HAS_ARGBPOLYNOMIALROW_SSE2 #define HAS_ARGBTOARGB1555ROW_SSE2 #define HAS_ARGBTOARGB4444ROW_SSE2 #define HAS_ARGBTORAWROW_SSSE3 @@ -142,7 +143,6 @@ extern "C" { #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) // Effects: #define HAS_ARGBCOLORTABLEROW_X86 -#define HAS_ARGBPOLYNOMIALROW_SSE2 #define HAS_RGBCOLORTABLEROW_X86 #define HAS_ARGBLUMACOLORTABLEROW_SSE2 diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 6ad24ae15..4e37888d4 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 785 +#define LIBYUV_VERSION 786 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_common.cc b/source/row_common.cc index 305499cf4..3ba0cdb69 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2075,13 +2075,22 @@ void ARGBLumaColorTableRow_C(const uint8* src_argb, dst_argb[3] = src_argb[3]; const uint8* luma1 = RGBToYJx256(src_argb[6], src_argb[5], src_argb[4]) + luma; - dst_argb[4] = luma0[src_argb[4]]; - dst_argb[5] = luma0[src_argb[5]]; - dst_argb[6] = luma0[src_argb[6]]; + dst_argb[4] = luma1[src_argb[4]]; + dst_argb[5] = luma1[src_argb[5]]; + dst_argb[6] = luma1[src_argb[6]]; dst_argb[7] = src_argb[7]; src_argb += 8; dst_argb += 8; } + if (width & 1) { + // Luminance in rows, color values in columns. + const uint8* luma0 = RGBToYJx256(src_argb[2], src_argb[1], src_argb[0]) + + luma; + dst_argb[0] = luma0[src_argb[0]]; + dst_argb[1] = luma0[src_argb[1]]; + dst_argb[2] = luma0[src_argb[2]]; + dst_argb[3] = src_argb[3]; + } } #undef clamp0 diff --git a/source/row_posix.cc b/source/row_posix.cc index 6e8722f6a..b413ab5b5 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4707,7 +4707,7 @@ void SobelYRow_SSSE3(const uint8* src_y0, const uint8* src_y1, // G = Sobel // B = Sobel void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely, - uint8* dst_argb, int width) { + uint8* dst_argb, int width) { asm volatile ( "sub %0,%1 \n" "pcmpeqb %%xmm5,%%xmm5 \n" @@ -5816,6 +5816,65 @@ int FixedDiv_X86(int num, int div) { return num; } #endif // HAS_FIXEDDIV_X86 + +#ifdef HAS_ARGBPOLYNOMIALROW_SSE2 +void ARGBPolynomialRow_SSE2(const uint8* src_argb, + uint8* dst_argb, const float* poly, + int width) { + asm volatile ( + "pxor %%xmm3,%%xmm3 \n" + + // 2 pixel loop. + ".p2align 4 \n" + "1: \n" + "movq (%0),%%xmm0 \n" + "lea 0x8(%0),%0 \n" + "punpcklbw %%xmm3,%%xmm0 \n" + "movdqa %%xmm0,%%xmm4 \n" + "punpcklwd %%xmm3,%%xmm0 \n" + "punpckhwd %%xmm3,%%xmm4 \n" + "cvtdq2ps %%xmm0,%%xmm0 \n" + "cvtdq2ps %%xmm4,%%xmm4 \n" + "movdqa %%xmm0,%%xmm1 \n" + "movdqa %%xmm4,%%xmm5 \n" + "mulps 0x10(%3),%%xmm0 \n" + "mulps 0x10(%3),%%xmm4 \n" + "addps (%3),%%xmm0 \n" + "addps (%3),%%xmm4 \n" + "movdqa %%xmm1,%%xmm2 \n" + "movdqa %%xmm5,%%xmm6 \n" + "mulps %%xmm1,%%xmm2 \n" + "mulps %%xmm5,%%xmm6 \n" + "mulps %%xmm2,%%xmm1 \n" + "mulps %%xmm6,%%xmm5 \n" + "mulps 0x20(%3),%%xmm2 \n" + "mulps 0x20(%3),%%xmm6 \n" + "mulps 0x30(%3),%%xmm1 \n" + "mulps 0x30(%3),%%xmm5 \n" + "addps %%xmm2,%%xmm0 \n" + "addps %%xmm6,%%xmm4 \n" + "addps %%xmm1,%%xmm0 \n" + "addps %%xmm5,%%xmm4 \n" + "cvttps2dq %%xmm0,%%xmm0 \n" + "cvttps2dq %%xmm4,%%xmm4 \n" + "packuswb %%xmm4,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "sub $0x2,%2 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" + "jg 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(poly) // %3 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" +#endif + ); +} +#endif // HAS_ARGBPOLYNOMIALROW_SSE2 + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/row_win.cc b/source/row_win.cc index 1bf4a94ed..a2ef9a1f7 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6781,6 +6781,7 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. + // 2 pixel loop. align 16 convertloop: // (slow) pmovzxbd xmm0, dword ptr [eax] // BGRA pixel @@ -6829,8 +6830,8 @@ void ARGBPolynomialRow_SSE2(const uint8* src_argb, #ifdef HAS_ARGBPOLYNOMIALROW_AVX2 __declspec(naked) __declspec(align(16)) void ARGBPolynomialRow_AVX2(const uint8* src_argb, - uint8* dst_argb, const float* poly, - int width) { + uint8* dst_argb, const float* poly, + int width) { __asm { mov eax, [esp + 12] /* poly */ vmovdqu xmm4, [eax] @@ -6846,6 +6847,7 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb, mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 16] /* width */ + // 2 pixel loop. align 16 convertloop: vpmovzxbd ymm0, qword ptr [eax] // 2 BGRA pixels