diff --git a/README.chromium b/README.chromium index 00be757d2..1271f3ea8 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 613 +Version: 614 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index ee6c1709d..d5a0452b7 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -19,6 +19,11 @@ extern "C" { #endif // TODO(fbarchard): Remove kMaxStride. +// Functions should allocate a single row buffer of this size on the stack. +// Functions that allocate more than one row buffer may fail or cause stack +// probe. +// This size is one row of retina Mac pixels (2880 wide) in 32 bit ARGB. +// Functions may want less for 8 or 16 bit row buffers. #define kMaxStride (2880 * 4) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 4a393b103..a55f748d5 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 613 +#define LIBYUV_VERSION 614 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/format_conversion.cc b/source/format_conversion.cc index f80caeafe..53955f715 100644 --- a/source/format_conversion.cc +++ b/source/format_conversion.cc @@ -72,10 +72,10 @@ int ARGBToBayer(const uint8* src_argb, int src_stride_argb, void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 && + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; } } @@ -430,9 +430,9 @@ int I420ToBayer(const uint8* src_y, int src_stride_y, void 
(*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, uint32 selector, int pix) = ARGBToBayerRow_C; #if defined(HAS_ARGBTOBAYERROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 8)) { ARGBToBayerRow = ARGBToBayerRow_SSSE3; } } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 1cc7d6d2a..68567077b 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1823,13 +1823,12 @@ int ARGBShuffle(const uint8* src_bgra, int src_stride_bgra, } // Sobel ARGB effect. -// TODO(fbarchard): Enable AVX2. Mixing SSSE3 and AVX2 requires zeroupper. LIBYUV_API int ARGBSobel(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height) { if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > kMaxStride) { + width <= 0 || height == 0 || width > (kMaxStride / 4)) { return -1; } // Negative height means invert the image. @@ -1838,39 +1837,25 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = - ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - // Assumed row buffer aligned. 
- if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2_DISABLED) - bool clear = false; - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { - clear = true; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToYRow = ARGBToYRow_Any_NEON; + // ARGBToBayer used to select G channel from ARGB. + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; } } #endif - void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) = SobelYRow_C; #if defined(HAS_SOBELYROW_SSSE3) @@ -1896,18 +1881,18 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, #endif const int kEdge = 16; // Extra pixels at start of row for extrude/align. - SIMD_ALIGNED(uint8 row_y[(kMaxStride + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride]); + SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); + SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); + SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); // Convert first row. 
uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride; - uint8* row_y2 = row_y1 + kMaxStride; - ARGBToYRow(src_argb, row_y0, width); + uint8* row_y1 = row_y0 + kMaxStride / 4; + uint8* row_y2 = row_y1 + kMaxStride / 4; + ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); row_y0[-1] = row_y0[0]; row_y0[width] = row_y0[width - 1]; - ARGBToYRow(src_argb, row_y1, width); + ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); row_y1[-1] = row_y1[0]; row_y1[width] = row_y1[width - 1]; @@ -1916,7 +1901,7 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, if (y < (height - 1)) { src_argb += src_stride_argb; } - ARGBToYRow(src_argb, row_y2, width); + ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); row_y2[-1] = row_y2[0]; row_y2[width] = row_y2[width - 1]; @@ -1932,23 +1917,17 @@ int ARGBSobel(const uint8* src_argb, int src_stride_argb, dst_argb += dst_stride_argb; } -#if defined(HAS_ARGBTOYROW_AVX2_DISABLED) - if (clear) { - __asm vzeroupper; - } -#endif return 0; } // SobelXY ARGB effect. // Similar to Sobel, but also stores Sobel X in R and Sobel Y in B. G = Sobel. -// TODO(fbarchard): Enable AVX2. Mixing SSSE3 and AVX2 requires zeroupper. LIBYUV_API int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, uint8* dst_argb, int dst_stride_argb, int width, int height) { if (!src_argb || !dst_argb || - width <= 0 || height == 0 || width > kMaxStride) { + width <= 0 || height == 0 || width > kMaxStride / 4) { return -1; } // Negative height means invert the image. 
@@ -1957,35 +1936,22 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, src_argb = src_argb + (height - 1) * src_stride_argb; src_stride_argb = -src_stride_argb; } - void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) = - ARGBToYRow_C; -#if defined(HAS_ARGBTOYROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) { - ARGBToYRow = ARGBToYRow_Any_SSSE3; - if (IS_ALIGNED(width, 16)) { - ARGBToYRow = ARGBToYRow_Unaligned_SSSE3; - // Assumed row buffer aligned. - if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToYRow = ARGBToYRow_SSSE3; - } - } - } -#endif -#if defined(HAS_ARGBTOYROW_AVX2_DISABLED) - bool clear = false; - if (TestCpuFlag(kCpuHasAVX2) && width >= 32) { - clear = true; - ARGBToYRow = ARGBToYRow_Any_AVX2; - if (IS_ALIGNED(width, 32)) { - ARGBToYRow = ARGBToYRow_AVX2; - } - } -#endif -#if defined(HAS_ARGBTOYROW_NEON) - if (TestCpuFlag(kCpuHasNEON) && width >= 8) { - ARGBToYRow = ARGBToYRow_Any_NEON; + // ARGBToBayer used to select G channel from ARGB. + void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer, + uint32 selector, int pix) = ARGBToBayerRow_C; +#if defined(HAS_ARGBTOBAYERROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { + ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToYRow = ARGBToYRow_NEON; + ARGBToBayerRow = ARGBToBayerRow_SSSE3; + } + } +#elif defined(HAS_ARGBTOBAYERROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && width >= 4) { + ARGBToBayerRow = ARGBToBayerRow_Any_NEON; + if (IS_ALIGNED(width, 4)) { + ARGBToBayerRow = ARGBToBayerRow_NEON; } } #endif @@ -2015,18 +1981,18 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, #endif const int kEdge = 16; // Extra pixels at start of row for extrude/align. 
- SIMD_ALIGNED(uint8 row_y[(kMaxStride + kEdge) * 3 + kEdge]); - SIMD_ALIGNED(uint8 row_sobelx[kMaxStride]); - SIMD_ALIGNED(uint8 row_sobely[kMaxStride]); + SIMD_ALIGNED(uint8 row_y[(kMaxStride / 4 + kEdge) * 3 + kEdge]); + SIMD_ALIGNED(uint8 row_sobelx[kMaxStride / 4]); + SIMD_ALIGNED(uint8 row_sobely[kMaxStride / 4]); // Convert first row. uint8* row_y0 = row_y + kEdge; - uint8* row_y1 = row_y0 + kMaxStride; - uint8* row_y2 = row_y1 + kMaxStride; - ARGBToYRow(src_argb, row_y0, width); + uint8* row_y1 = row_y0 + kMaxStride / 4; + uint8* row_y2 = row_y1 + kMaxStride / 4; + ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width); row_y0[-1] = row_y0[0]; row_y0[width] = row_y0[width - 1]; - ARGBToYRow(src_argb, row_y1, width); + ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width); row_y1[-1] = row_y1[0]; row_y1[width] = row_y1[width - 1]; @@ -2035,7 +2001,7 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, if (y < (height - 1)) { src_argb += src_stride_argb; } - ARGBToYRow(src_argb, row_y2, width); + ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width); row_y2[-1] = row_y2[0]; row_y2[width] = row_y2[width - 1]; @@ -2051,11 +2017,6 @@ int ARGBSobelXY(const uint8* src_argb, int src_stride_argb, dst_argb += dst_stride_argb; } -#if defined(HAS_ARGBTOYROW_AVX2_DISABLED) - if (clear) { - __asm vzeroupper; - } -#endif return 0; } diff --git a/source/row_any.cc b/source/row_any.cc index 5a3bf4cd6..bd55adaf1 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -186,7 +186,7 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, #if defined(HAS_ARGBTOBAYERROW_SSSE3) BAYERANY(ARGBToBayerRow_Any_SSSE3, ARGBToBayerRow_SSSE3, ARGBToBayerRow_C, - 3, 4, 1) + 7, 4, 1) #endif #if defined(HAS_ARGBTOBAYERROW_NEON) BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C, diff --git a/source/row_posix.cc b/source/row_posix.cc index 4e62b0cd2..34b24c52b 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -4595,11 +4595,14 @@ void 
ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" - "lea 0x10(%0),%0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" "pshufb %%xmm5,%%xmm0 \n" - "sub $0x4,%2 \n" - "movd %%xmm0,(%1) \n" - "lea 0x4(%1),%1 \n" + "pshufb %%xmm5,%%xmm1 \n" + "punpckldq %%xmm1,%%xmm0 \n" + "sub $0x8,%2 \n" + "movq %%xmm0,(%1) \n" + "lea 0x8(%1),%1 \n" "jg 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -4607,7 +4610,7 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, : "g"(selector) // %3 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm5" + , "xmm0", "xmm1", "xmm5" #endif ); } diff --git a/source/row_win.cc b/source/row_win.cc index a46396940..b61cdcc4d 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -5795,11 +5795,14 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer, align 16 wloop: movdqa xmm0, [eax] - lea eax, [eax + 16] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] pshufb xmm0, xmm5 - sub ecx, 4 - movd [edx], xmm0 - lea edx, [edx + 4] + pshufb xmm1, xmm5 + punpckldq xmm0, xmm1 + sub ecx, 8 + movq qword ptr [edx], xmm0 + lea edx, [edx + 8] jg wloop ret } diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 1026cdefc..029e2d03b 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -1352,25 +1352,25 @@ static int TestSobel(int width, int height, int benchmark_iterations, TEST_F(libyuvTest, ARGBSobel_Any) { int max_diff = TestSobel(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, +1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } TEST_F(libyuvTest, ARGBSobel_Unaligned) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, +1, 1); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } TEST_F(libyuvTest, ARGBSobel_Invert) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, -1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, 
max_diff); } TEST_F(libyuvTest, ARGBSobel_Opt) { int max_diff = TestSobel(benchmark_width_, benchmark_height_, benchmark_iterations_, +1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } static int TestSobelXY(int width, int height, int benchmark_iterations, @@ -1415,25 +1415,25 @@ static int TestSobelXY(int width, int height, int benchmark_iterations, TEST_F(libyuvTest, ARGBSobelXY_Any) { int max_diff = TestSobelXY(benchmark_width_ - 1, benchmark_height_, benchmark_iterations_, +1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } TEST_F(libyuvTest, ARGBSobelXY_Unaligned) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, +1, 1); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } TEST_F(libyuvTest, ARGBSobelXY_Invert) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, -1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } TEST_F(libyuvTest, ARGBSobelXY_Opt) { int max_diff = TestSobelXY(benchmark_width_, benchmark_height_, benchmark_iterations_, +1, 0); - EXPECT_LE(max_diff, 14); + EXPECT_EQ(0, max_diff); } } // namespace libyuv diff --git a/util/compare.cc b/util/compare.cc index 5fded3bbf..c36c0fa5f 100644 --- a/util/compare.cc +++ b/util/compare.cc @@ -61,4 +61,3 @@ int main(int argc, char** argv) { } fclose(fin1); } -