diff --git a/README.chromium b/README.chromium
index a742a658b..7045920b2 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 777
+Version: 778
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 4f8dc2ef8..57da063d6 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -210,6 +210,20 @@ int RGBColorTable(uint8* dst_argb, int dst_stride_argb,
                   const uint8* table_argb,
                   int x, int y, int width, int height);
 
+// Apply a 3rd degree polynomial to ARGB values.
+// poly points to a 4x4 matrix. The first row is constants. The 2nd row is
+// coefficients for b, g, r and a. The 3rd row is coefficients for b squared,
+// g squared, r squared and a squared. The 4th row is coefficients for b to
+// the 3, g to the 3, r to the 3 and a to the 3. The values are summed and
+// result clamped to 0 to 255.
+// A polynomial approximation can be derived using software such as 'R'.
+
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 91b034fbd..e898381c5 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -140,8 +140,11 @@ extern "C" {
 // The following are Windows only:
 // TODO(fbarchard): Port to gcc.
 #if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// Effects:
 #define HAS_ARGBCOLORTABLEROW_X86
 #define HAS_RGBCOLORTABLEROW_X86
+#define HAS_ARGBPOLYNOMIALROW_SSE2
+
 // Caveat: Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
 #define HAS_ARGBSHUFFLEROW_AVX2
@@ -320,6 +323,7 @@ typedef __declspec(align(32)) uint32 ulvec32[8];
 typedef __declspec(align(32)) uint8 ulvec8[32];
 
 #elif defined(__GNUC__)
+// Caveat GCC 4.2 to 4.7 have a known issue using vectors with const.
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 typedef int16 __attribute__((vector_size(16))) vec16;
 typedef int32 __attribute__((vector_size(16))) vec32;
@@ -1542,6 +1546,14 @@ void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
 void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
                      uint8* dst_argb, int width);
 
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width);
+
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width);
+
 // Divide num by div and return as 16.16 fixed point result.
 int FixedDiv_C(int num, int div);
 int FixedDiv_X86(int num, int div);
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 53483e7b9..aad646167 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 777
+#define LIBYUV_VERSION 778
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 87db9d05e..da5f0d2df 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -2032,6 +2032,38 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
+// Apply a 4x4 polynomial to each ARGB pixel.
+LIBYUV_API
+int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
+                   uint8* dst_argb, int dst_stride_argb,
+                   const float* poly,
+                   int width, int height) {
+  if (!src_argb || !dst_argb || !poly || width <= 0 || height <= 0) {
+    return -1;
+  }
+  // Coalesce contiguous rows.
+  if (src_stride_argb == width * 4 && dst_stride_argb == width * 4) {
+    return ARGBPolynomial(src_argb, 0,
+                          dst_argb, 0,
+                          poly,
+                          width * height, 1);
+  }
+  void (*ARGBPolynomialRow)(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) = ARGBPolynomialRow_C;
+#if defined(HAS_ARGBPOLYNOMIALROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBPolynomialRow = ARGBPolynomialRow_SSE2;
+  }
+#endif
+  for (int y = 0; y < height; ++y) {
+    ARGBPolynomialRow(src_argb, dst_argb, poly, width);
+    src_argb += src_stride_argb;
+    dst_argb += dst_stride_argb;
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index ea776fb31..03d0e5bc0 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2009,8 +2009,51 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
   UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
   I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
 }
+
 #endif  // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
 #endif  // !defined(LIBYUV_DISABLE_X86)
+
+// Apply a 3rd degree polynomial to each channel of a row of ARGB pixels.
+// poly is 16 floats: 4 rows (C0, C1, C2, C3) of b, g, r, a coefficients.
+// Each result is truncated to an integer and clamped to 0 to 255.
+void ARGBPolynomialRow_C(const uint8* src_argb,
+                         uint8* dst_argb, const float* poly,
+                         int width) {
+  for (int i = 0; i < width; ++i) {
+    float b = static_cast<float>(src_argb[0]);
+    float g = static_cast<float>(src_argb[1]);
+    float r = static_cast<float>(src_argb[2]);
+    float a = static_cast<float>(src_argb[3]);
+    float b2 = b * b;
+    float g2 = g * g;
+    float r2 = r * r;
+    float a2 = a * a;
+    float db = poly[0] + poly[4] * b;
+    float dg = poly[1] + poly[5] * g;
+    float dr = poly[2] + poly[6] * r;
+    float da = poly[3] + poly[7] * a;
+    db += poly[8] * b2;
+    dg += poly[9] * g2;
+    dr += poly[10] * r2;
+    da += poly[11] * a2;
+    float b3 = b2 * b;
+    float g3 = g2 * g;
+    float r3 = r2 * r;
+    float a3 = a2 * a;
+    db += poly[12] * b3;
+    dg += poly[13] * g3;
+    dr += poly[14] * r3;
+    da += poly[15] * a3;
+
+    dst_argb[0] = Clamp(static_cast<int32>(db));
+    dst_argb[1] = Clamp(static_cast<int32>(dg));
+    dst_argb[2] = Clamp(static_cast<int32>(dr));
+    dst_argb[3] = Clamp(static_cast<int32>(da));
+    src_argb += 4;
+    dst_argb += 4;
+  }
+}
+
 #undef clamp0
 #undef clamp255
 
diff --git a/source/row_win.cc b/source/row_win.cc
index 55235e27f..af87a3d1f 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -6767,6 +6767,54 @@ int FixedDiv_X86(int num, int div) {
   }
 }
 #endif  // HAS_FIXEDDIV_X86
+
+#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
+// Apply a 3rd degree polynomial to a row of ARGB pixels, 1 pixel per loop.
+__declspec(naked) __declspec(align(16))
+void ARGBPolynomialRow_SSE2(const uint8* src_argb,
+                            uint8* dst_argb, const float* poly,
+                            int width) {
+  __asm {
+    mov        eax, [esp + 12]  /* poly */
+    movdqu     xmm4, [eax]
+    movdqu     xmm5, [eax + 16]
+    movdqu     xmm6, [eax + 32]
+    movdqu     xmm7, [eax + 48]
+
+    mov        eax, [esp + 4]  /* src_argb */
+    mov        edx, [esp + 8]  /* dst_argb */
+    mov        ecx, [esp + 16]  /* width */
+    pxor       xmm3, xmm3  // 4 bytes to 4 ints
+
+    align      16
+ convertloop:
+    movd       xmm0, [eax]  // BGRA
+    lea        eax, [eax + 4]
+    punpcklbw  xmm0, xmm3
+    punpcklwd  xmm0, xmm3
+    cvtdq2ps   xmm0, xmm0  // 4 floats
+    movdqa     xmm1, xmm0  // X
+    mulps      xmm0, xmm5  // C1 * X
+    addps      xmm0, xmm4  // result = C0 + C1 * X
+    movdqa     xmm2, xmm1
+    mulps      xmm2, xmm1  // X * X
+    mulps      xmm1, xmm2  // X * X * X
+    mulps      xmm2, xmm6  // C2 * X * X
+    mulps      xmm1, xmm7  // C3 * X * X * X
+    addps      xmm0, xmm2  // result += C2 * X * X
+    addps      xmm0, xmm1  // result += C3 * X * X * X
+    cvttps2dq  xmm0, xmm0  // truncate 4 floats to 4 int32
+    packssdw   xmm0, xmm0  // int32 to int16, signed saturation (keeps > 32767)
+    packuswb   xmm0, xmm0  // int16 to uint8, clamps to 0 to 255
+    sub        ecx, 1
+    movd       [edx], xmm0  // movd and lea leave the flags from sub intact
+    lea        edx, [edx + 4]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
+
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
 
 #ifdef __cplusplus
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index b3984dbd4..8683592fe 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -1656,4 +1656,68 @@ TEST_F(libyuvTest, ARGBBlur_Opt) {
   EXPECT_LE(max_diff, 1);
 }
 
+TEST_F(libyuvTest, TestARGBPolynomial) {
+  SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
+  SIMD_ALIGNED(uint8 dst_pixels[1280][4]);
+
+  static const float kWarmifyPolynomial[16] = {
+    0.94230f,  -3.03300f,    -2.92500f,    0.f,  // C0
+    0.584500f, 1.112000f,    1.535000f,    1.f,  // C1 x
+    0.001313f, -0.002503f,   -0.004496f,   0.f,  // C2 x * x
+    0.0f,      0.000006965f, 0.000008781f, 0.f,  // C3 x * x * x
+  };
+
+  // Test blue
+  orig_pixels[0][0] = 255u;
+  orig_pixels[0][1] = 0u;
+  orig_pixels[0][2] = 0u;
+  orig_pixels[0][3] = 128u;
+  // Test green
+  orig_pixels[1][0] = 0u;
+  orig_pixels[1][1] = 255u;
+  orig_pixels[1][2] = 0u;
+  orig_pixels[1][3] = 0u;
+  // Test red
+  orig_pixels[2][0] = 0u;
+  orig_pixels[2][1] = 0u;
+  orig_pixels[2][2] = 255u;
+  orig_pixels[2][3] = 255u;
+  // Test color
+  orig_pixels[3][0] = 16u;
+  orig_pixels[3][1] = 64u;
+  orig_pixels[3][2] = 192u;
+  orig_pixels[3][3] = 224u;
+  // Do 16 to test asm version.
+  ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0,
+                 &kWarmifyPolynomial[0], 16, 1);
+  EXPECT_EQ(235u, dst_pixels[0][0]);
+  EXPECT_EQ(0u, dst_pixels[0][1]);
+  EXPECT_EQ(0u, dst_pixels[0][2]);
+  EXPECT_EQ(128u, dst_pixels[0][3]);
+  EXPECT_EQ(0u, dst_pixels[1][0]);
+  EXPECT_EQ(233u, dst_pixels[1][1]);
+  EXPECT_EQ(0u, dst_pixels[1][2]);
+  EXPECT_EQ(0u, dst_pixels[1][3]);
+  EXPECT_EQ(0u, dst_pixels[2][0]);
+  EXPECT_EQ(0u, dst_pixels[2][1]);
+  EXPECT_EQ(241u, dst_pixels[2][2]);
+  EXPECT_EQ(255u, dst_pixels[2][3]);
+  EXPECT_EQ(10u, dst_pixels[3][0]);
+  EXPECT_EQ(59u, dst_pixels[3][1]);
+  EXPECT_EQ(188u, dst_pixels[3][2]);
+  EXPECT_EQ(224u, dst_pixels[3][3]);
+
+  for (int i = 0; i < 1280; ++i) {
+    orig_pixels[i][0] = i;
+    orig_pixels[i][1] = i / 2;
+    orig_pixels[i][2] = i / 3;
+    orig_pixels[i][3] = i;
+  }
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    ARGBPolynomial(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0,
+                   &kWarmifyPolynomial[0], 1280, 1);
+  }
+}
+
+
 }  // namespace libyuv