From 933bd40c3c894583f2e0243f5409a8e17d868ba0 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 27 Feb 2015 21:15:28 +0000 Subject: [PATCH] port ARGBToRGB565 and ARGB1555 to AVX2. Enable functions that use ARGBToRGB565 AVX2 code. Add ARGBToRGB565Dither function. BUG=403 TESTED=local windows build R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/42109004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1302 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- include/libyuv/convert_from_argb.h | 7 ++ include/libyuv/row.h | 20 ++++-- source/convert_from_argb.cc | 56 +++++++++++++++ source/row_any.cc | 4 ++ source/row_common.cc | 29 +++++++- source/row_win.cc | 83 +++++++++++++++++++++- unit_test/convert_test.cc | 106 +++++++++++++++++++++++++++++ 7 files changed, 295 insertions(+), 10 deletions(-) diff --git a/include/libyuv/convert_from_argb.h b/include/libyuv/convert_from_argb.h index 75747fe7d..c592fc235 100644 --- a/include/libyuv/convert_from_argb.h +++ b/include/libyuv/convert_from_argb.h @@ -61,6 +61,13 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, uint8* dst_rgb565, int dst_stride_rgb565, int width, int height); +// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). +// Values in dither matrix from 0 to 255. 128 is best for no dither. +LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither8x8, int width, int height); + // Convert ARGB To ARGB1555. 
LIBYUV_API int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4592c16a1..80e844bae 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -190,7 +190,14 @@ extern "C" { #define HAS_I422TORGBAROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV21TOARGBROW_AVX2 +#define HAS_ARGBTORGB565ROW_AVX2 +#define HAS_ARGBTOARGB1555ROW_AVX2 #define HAS_ARGBTOARGB4444ROW_AVX2 +#define HAS_NV12TORGB565ROW_AVX2 +#define HAS_NV21TORGB565ROW_AVX2 +#define HAS_I422TORGB565ROW_AVX2 +#define HAS_I422TOARGB1555ROW_AVX2 +#define HAS_I422TOARGB4444ROW_AVX2 #endif // The following are available on all x86 platforms, but @@ -223,12 +230,6 @@ extern "C" { #if defined(HAS_I422TOARGBROW_AVX2) #define HAS_YUY2TOARGBROW_AVX2 #define HAS_UYVYTOARGBROW_AVX2 -// TODO(fbarchard): Enable once low levels are ported to AVX2 -// #define HAS_NV12TORGB565ROW_AVX2 -// #define HAS_NV21TORGB565ROW_AVX2 -// #define HAS_I422TORGB565ROW_AVX2 -// #define HAS_I422TOARGB1555ROW_AVX2 -#define HAS_I422TOARGB4444ROW_AVX2 #endif // Effects: @@ -904,6 +905,8 @@ void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); @@ -919,6 +922,9 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix); + void 
I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int pix); void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix); @@ -1369,6 +1375,8 @@ void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB1555Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToRGB565Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); +void ARGBToARGB1555Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToARGB4444Row_Any_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix); void ARGBToRGB24Row_Any_NEON(const uint8* src_argb, uint8* dst_rgb, int pix); diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 7ce430dac..dc2186a6a 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -804,6 +804,46 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, return 0; } +static const uint8 kDither8x8[64] = { + 0, 128, 32, 160, 8, 136, 40, 168, + 192, 64, 224, 96, 200, 72, 232, 104, + 48, 176, 16, 144, 56, 184, 24, 152, + 240, 112, 208, 80, 248, 120, 216, 88, + 12, 140, 44, 172, 4, 132, 36, 164, + 204, 76, 236, 108, 196, 68, 228, 100, + 60, 188, 28, 156, 52, 180, 20, 148, + 252, 124, 220, 92, 244, 116, 212, 84, +}; + +// Convert ARGB To RGB565 with 8x8 dither matrix (64 bytes). 
+LIBYUV_API +int ARGBToRGB565Dither(const uint8* src_argb, int src_stride_argb, + uint8* dst_rgb565, int dst_stride_rgb565, + const uint8* dither8x8, int width, int height) { + int y; + void (*ARGBToRGB565DitherRow)(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int pix) = ARGBToRGB565DitherRow_C; + if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) { + return -1; + } + if (height < 0) { + height = -height; + src_argb = src_argb + (height - 1) * src_stride_argb; + src_stride_argb = -src_stride_argb; + } + if (!dither8x8) { + dither8x8 = kDither8x8; + + } + for (y = 0; y < height; ++y) { + ARGBToRGB565DitherRow(src_argb, dst_rgb565, + dither8x8 + ((y & 7) << 3), width); + src_argb += src_stride_argb; + dst_rgb565 += dst_stride_rgb565; + } + return 0; +} + // Convert ARGB To RGB565. LIBYUV_API int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, @@ -835,6 +875,14 @@ int ARGBToRGB565(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTORGB565ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToRGB565Row = ARGBToRGB565Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTORGB565ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON; @@ -883,6 +931,14 @@ int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb, } } #endif +#if defined(HAS_ARGBTOARGB1555ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_AVX2; + if (IS_ALIGNED(width, 8)) { + ARGBToARGB1555Row = ARGBToARGB1555Row_AVX2; + } + } +#endif #if defined(HAS_ARGBTOARGB1555ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON; diff --git a/source/row_any.cc b/source/row_any.cc index 8a678f2fb..19340b3b7 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -175,6 +175,10 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C, 4, 2, 3) #endif #if 
defined(HAS_ARGBTOARGB4444ROW_AVX2) +RGBANY(ARGBToRGB565Row_Any_AVX2, ARGBToRGB565Row_AVX2, ARGBToRGB565Row_C, + 4, 2, 7) +RGBANY(ARGBToARGB1555Row_Any_AVX2, ARGBToARGB1555Row_AVX2, ARGBToARGB1555Row_C, + 4, 2, 7) RGBANY(ARGBToARGB4444Row_Any_AVX2, ARGBToARGB4444Row_AVX2, ARGBToARGB4444Row_C, 4, 2, 7) #endif diff --git a/source/row_common.cc b/source/row_common.cc index 49efd67da..e0e2bf426 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -199,6 +199,32 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { } } +void ARGBToRGB565DitherRow_C(const uint8* src_argb, uint8* dst_rgb, + const uint8* dither8x8, int width) { + int x; + for (x = 0; x < width - 1; x += 2) { + int dither0 = dither8x8[x & 7] - 128; + int dither1 = dither8x8[(x & 7) + 1] - 128; + uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; + uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; + uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; + uint8 b1 = Clamp(src_argb[4] + dither1) >> 3; + uint8 g1 = Clamp(src_argb[5] + dither1) >> 2; + uint8 r1 = Clamp(src_argb[6] + dither1) >> 3; + WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | + (b1 << 16) | (g1 << 21) | (r1 << 27)); + dst_rgb += 4; + src_argb += 8; + } + if (width & 1) { + int dither0 = dither8x8[(width - 1) & 7] - 128; + uint8 b0 = Clamp(src_argb[0] + dither0) >> 3; + uint8 g0 = Clamp(src_argb[1] + dither0) >> 2; + uint8 r0 = Clamp(src_argb[2] + dither0) >> 3; + *(uint16*)(dst_rgb) = b0 | (g0 << 5) | (r0 << 11); + } +} + void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) { int x; for (x = 0; x < width - 1; x += 2) { @@ -2258,8 +2284,7 @@ void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, uint8* dst_argb, int width) { } #endif // !defined(LIBYUV_DISABLE_X86) -#if defined(HAS_I422TORGB565ROW_AVX2) && !defined(_MSC_VER) -// row_win.cc has asm version, but GCC uses 2 step wrapper. 
+#if defined(HAS_I422TORGB565ROW_AVX2) void I422ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_win.cc b/source/row_win.cc index 68c50cd50..5c06b6078 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -693,6 +693,85 @@ void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { } } +#ifdef HAS_ARGBTORGB565ROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBToRGB565Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm3, ymm3, ymm3 // generate mask 0x0000001f + vpsrld ymm3, ymm3, 27 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0x000007e0 + vpsrld ymm4, ymm4, 26 + vpslld ymm4, ymm4, 5 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xfffff800 + vpslld ymm5, ymm5, 11 + + convertloop: + vmovdqu ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm2, ymm0, 5 // G + vpsrld ymm1, ymm0, 3 // B + vpslld ymm0, ymm0, 8 // R + vpand ymm2, ymm2, ymm4 // G + vpand ymm1, ymm1, ymm3 // B + vpsrad ymm0, ymm0, 16 // R + vpand ymm0, ymm0, ymm5 // R + vpor ymm1, ymm1, ymm2 // BG + vpor ymm0, ymm0, ymm1 // BGR + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of RGB565 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTORGB565ROW_AVX2 + +#ifdef HAS_ARGBTOARGB1555ROW_AVX2 +__declspec(naked) __declspec(align(16)) +void ARGBToARGB1555Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { + __asm { + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_rgb + mov ecx, [esp + 12] // pix + vpcmpeqb ymm4, ymm4, ymm4 + vpsrld ymm4, ymm4, 27 // generate mask 0x0000001f + vpslld ymm5, ymm4, 5 // generate mask 0x000003e0 + vpslld ymm6, ymm4, 10 // generate mask 0x00007c00 + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffff8000 + vpslld ymm7, ymm7, 15 + + convertloop: + vmovdqu 
ymm0, [eax] // fetch 8 pixels of argb + vpsrld ymm3, ymm0, 9 // R + vpsrld ymm2, ymm0, 6 // G + vpsrld ymm1, ymm0, 3 // B + vpsrad ymm0, ymm0, 16 // A + vpand ymm3, ymm3, ymm6 // R + vpand ymm2, ymm2, ymm5 // G + vpand ymm1, ymm1, ymm4 // B + vpand ymm0, ymm0, ymm7 // A + vpor ymm0, ymm0, ymm1 // BA + vpor ymm2, ymm2, ymm3 // GR + vpor ymm0, ymm0, ymm2 // BGRA + vpackssdw ymm0, ymm0, ymm0 + vpermq ymm0, ymm0, 0xd8 + lea eax, [eax + 32] + vmovdqu [edx], xmm0 // store 8 pixels of ARGB1555 + lea edx, [edx + 16] + sub ecx, 8 + jg convertloop + vzeroupper + ret + } +} +#endif // HAS_ARGBTOARGB1555ROW_AVX2 + #ifdef HAS_ARGBTOARGB4444ROW_AVX2 __declspec(naked) __declspec(align(16)) void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { @@ -700,9 +779,9 @@ void ARGBToARGB4444Row_AVX2(const uint8* src_argb, uint8* dst_rgb, int pix) { mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb mov ecx, [esp + 12] // pix - vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 + vpcmpeqb ymm4, ymm4, ymm4 // generate mask 0xf000f000 vpsllw ymm4, ymm4, 12 - vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 + vpsrlw ymm3, ymm4, 8 // generate mask 0x00f000f0 convertloop: vmovdqu ymm0, [eax] // fetch 8 pixels of argb diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index d9a6d450d..d186a9e8d 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -934,6 +934,112 @@ TESTATOB(I400, 1, 1, 1, I400Mirror, 1, 1, 1, 0) TESTATOB(Y, 1, 1, 1, ARGB, 4, 4, 1, 0) TESTATOB(ARGB, 4, 4, 1, ARGBMirror, 4, 4, 1, 0) +#define TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + W1280, DIFF, N, NEG, OFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither##N) { \ + const int kWidth = ((W1280) > 0) ? 
(W1280) : 1; \ + const int kHeight = benchmark_height_; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ + align_buffer_64(src_argb, kStrideA * kHeightA + OFF); \ + align_buffer_64(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_64(dst_argb_opt, kStrideB * kHeightB); \ + srandom(time(NULL)); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i + OFF] = (random() & 0xff); \ + } \ + memset(dst_argb_c, 1, kStrideB * kHeightB); \ + memset(dst_argb_opt, 101, kStrideB * kHeightB); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \ + dst_argb_c, kStrideB, \ + NULL, kWidth, NEG kHeight); \ + MaskCpuFlags(-1); \ + for (int i = 0; i < benchmark_iterations_; ++i) { \ + FMT_A##To##FMT_B##Dither(src_argb + OFF, kStrideA, \ + dst_argb_opt, kStrideB, \ + NULL, kWidth, NEG kHeight); \ + } \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_64(src_argb); \ + free_aligned_buffer_64(dst_argb_c); \ + free_aligned_buffer_64(dst_argb_opt); \ +} + +#define TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ +TEST_F(libyuvTest, FMT_A##To##FMT_B##Dither_Random) { \ + srandom(time(NULL)); \ + for (int times = 0; times < benchmark_iterations_; ++times) { \ + const int kWidth = (random() & 63) + 1; \ + const int kHeight = (random() & 31) + 1; \ + const int kHeightA = (kHeight + HEIGHT_A - 1) / HEIGHT_A * HEIGHT_A; \ + const int kHeightB = (kHeight + HEIGHT_B - 1) / HEIGHT_B * HEIGHT_B; \ + const int kStrideA = (kWidth *
BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;\ + const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;\ + align_buffer_page_end(src_argb, kStrideA * kHeightA); \ + align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ + align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + for (int i = 0; i < kStrideA * kHeightA; ++i) { \ + src_argb[i] = (random() & 0xff); \ + } \ + memset(dst_argb_c, 123, kStrideB * kHeightB); \ + memset(dst_argb_opt, 123, kStrideB * kHeightB); \ + MaskCpuFlags(0); \ + FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \ + dst_argb_c, kStrideB, \ + NULL, kWidth, kHeight); \ + MaskCpuFlags(-1); \ + FMT_A##To##FMT_B##Dither(src_argb, kStrideA, \ + dst_argb_opt, kStrideB, \ + NULL, kWidth, kHeight); \ + int max_diff = 0; \ + for (int i = 0; i < kStrideB * kHeightB; ++i) { \ + int abs_diff = \ + abs(static_cast<int>(dst_argb_c[i]) - \ + static_cast<int>(dst_argb_opt[i])); \ + if (abs_diff > max_diff) { \ + max_diff = abs_diff; \ + } \ + } \ + EXPECT_LE(max_diff, DIFF); \ + free_aligned_buffer_page_end(src_argb); \ + free_aligned_buffer_page_end(dst_argb_c); \ + free_aligned_buffer_page_end(dst_argb_opt); \ + } \ +} + +#define TESTATOBD(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_ - 4, DIFF, _Any, +, 0) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Unaligned, +, 1) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Invert, -, 0) \ + TESTATOBDI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, \ + benchmark_width_, DIFF, _Opt, +, 0) \ + TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, \ + FMT_B, BPP_B, STRIDE_B, HEIGHT_B, DIFF) + +TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) + #define TESTSYMI(FMT_ATOB, BPP_A, STRIDE_A, HEIGHT_A, \ W1280, N,
NEG, OFF) \ TEST_F(libyuvTest, FMT_ATOB##_Symetric##N) { \