diff --git a/README.chromium b/README.chromium index ccf25feea..34e2a98d3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1307 +Version: 1309 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 8c6feda3a..ac9c17fe8 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -198,7 +198,10 @@ extern "C" { #define HAS_I422TORGB565ROW_AVX2 #define HAS_I422TOARGB1555ROW_AVX2 #define HAS_I422TOARGB4444ROW_AVX2 + +// TODO(fbarchard): Port to Neon #define HAS_ARGBTORGB565DITHERROW_SSE2 +#define HAS_ARGBTORGB565DITHERROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -906,8 +909,12 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8, int pix); void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint8* dither8x8, int pix); + const uint8* dither8, int pix); +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8, int pix); void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -926,9 +933,6 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); -void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, - const uint8* dither8x8, int pix); - void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); @@ -1380,7 +1384,9 @@ void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB565DitherRow_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, - const uint8* dither8x8, int pix); + const uint8* dither8, int pix); +void ARGBToRGB565DitherRow_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8, int pix); void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 04624d0a1..5aef5ba07 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1307 +#define LIBYUV_VERSION 1309 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index ce5d97e1c..c0e784b66 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -842,6 +842,14 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_SSE2; } } +#endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565DitherRow = ARGBToRGB565DitherRow_AVX2; + } + } #endif for (y = 0; y < height; ++y) { ARGBToRGB565DitherRow(src_argb, dst_rgb565, @@ -853,6 +861,7 @@ int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, } // Convert ARGB To RGB565. +// TODO(fbarchard): Consider using dither function low level with zeros. LIBYUV_API int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, diff --git a/source/row_any.cc b/source/row_any.cc index 631b09a4c..d7dcca854 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -239,6 +239,10 @@ RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C, 2, 4, 7) RGBDANY(ARGBToRGB565DitherRow_Any_SSE2, ARGBToRGB565DitherRow_SSE2, ARGBToRGB565DitherRow_C, 4, 2, 7) #endif +#if defined(HAS_ARGBTORGB565DITHERROW_AVX2) +RGBDANY(ARGBToRGB565DitherRow_Any_AVX2, ARGBToRGB565DitherRow_AVX2, + ARGBToRGB565DitherRow_C, 4, 2, 7) +#endif #undef RGBDANY // ARGB to Bayer does multiple of 4 pixels, SSSE3 aligned src, unaligned dst. diff --git a/source/row_win.cc b/source/row_win.cc index e6796e407..c87f813df 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -687,6 +687,54 @@ void ARGBToRGB565DitherRow_SSE2(const uint8* src_argb, uint8* dst_rgb, } } +#ifdef HAS_ARGBTORGB565DITHERROW_AVX2 +// TODO(fbarchard): Consider vpackusdw and remove vpsrad 16 +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565DitherRow_AVX2(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8, int pix) { + __asm { + mov eax, [esp + 12] // dither8 + vmovq xmm6, qword ptr [eax] // fetch 8 dither values + vpunpcklbw xmm6, xmm6, xmm6 + vpermq ymm6, ymm6, 0xd8 + vpunpcklwd ymm6, ymm6, ymm6 + + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 16] // pix + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 + vpslld ymm5, ymm5, 11 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpaddusb ymm0, ymm0, ymm6 + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpslld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpsrad ymm0, ymm0, 16 // R + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565DITHERROW_AVX2 + // TODO(fbarchard): Improve sign extension/packing. __declspec(naked) __declspec(align(16)) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { @@ -759,6 +807,7 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } #ifdef HAS_ARGBTORGB565ROW_AVX2 +// TODO(fbarchard): Consider vpackusdw and remove vpsrad 16 __declspec(naked) __declspec(align(16)) void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index d186a9e8d..36c8bb936 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1381,4 +1381,37 @@ TEST_F(libyuvTest, TestYToARGB) { } } +static const uint8 kNoDither8x8[64] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +TEST_F(libyuvTest, TestDither) { + align_buffer_64(src_argb, benchmark_width_ * benchmark_height_ * 4); + align_buffer_64(dst_rgb565, benchmark_width_ * benchmark_height_ * 2); + align_buffer_64(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); + MemRandomize(src_argb, benchmark_width_ * benchmark_height_ * 4); + MemRandomize(dst_rgb565, benchmark_width_ * benchmark_height_ * 2); + MemRandomize(dst_rgb565dither, benchmark_width_ * benchmark_height_ * 2); + ARGBToRGB565(src_argb, benchmark_width_ * 4, + dst_rgb565, benchmark_width_ * 2, + benchmark_width_, benchmark_height_); + ARGBToRGB565Dither(src_argb, benchmark_width_ * 4, + dst_rgb565dither, benchmark_width_ * 2, + kNoDither8x8, benchmark_width_, benchmark_height_); + for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) { + EXPECT_EQ(dst_rgb565[i], dst_rgb565dither[i]); + } + + free_aligned_buffer_64(src_argb); + free_aligned_buffer_64(dst_rgb565); + free_aligned_buffer_64(dst_rgb565dither); +} + } // namespace libyuv