diff --git a/README.chromium b/README.chromium index 89399c5e0..ddf47c3ed 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 576 +Version: 577 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fd9e4c633..ae1ba00db 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -140,6 +140,7 @@ extern "C" { // Effects #define HAS_ARGBATTENUATEROW_AVX2 +#define HAS_ARGBUNATTENUATEROW_AVX2 #endif #endif @@ -1324,6 +1325,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, extern uint32 fixed_invtbl8[256]; void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); diff --git a/source/compare.cc b/source/compare.cc index b829eb035..40a8ce958 100644 --- a/source/compare.cc +++ b/source/compare.cc @@ -145,11 +145,9 @@ LIBYUV_API uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a, const uint8* src_b, int stride_b, int width, int height) { - if (stride_a == width && stride_b == width) { return ComputeSumSquareError(src_a, src_b, width * height); } - uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) = SumSquareError_C; #if defined(HAS_SUMSQUAREERROR_NEON) diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 5df56fd82..075c937f0 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1085,6 +1085,12 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb, ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2; } #endif +#if defined(HAS_ARGBUNATTENUATEROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2) && 
IS_ALIGNED(width, 8)) { + ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2; + } +#endif +// TODO(fbarchard): Neon version. for (int y = 0; y < height; ++y) { ARGBUnattenuateRow(src_argb, dst_argb, width); diff --git a/source/row_common.cc b/source/row_common.cc index e603a8256..44e058252 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1528,7 +1528,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { // 8.16 fixed point inverse table #define T(a) 0x10000 / a uint32 fixed_invtbl8[256] = { - 0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), + 0xffff, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07), T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f), T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17), T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f), diff --git a/source/row_win.cc b/source/row_win.cc index d4a81a4a9..8cff3c0ef 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -4462,6 +4462,53 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, } #endif // HAS_ARGBUNATTENUATEROW_SSE2 +#ifdef HAS_ARGBUNATTENUATEROW_AVX2 +// Shuffle table duplicating alpha. +static const ulvec8 kUnattenShuffleAlpha_AVX2 = { + 0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u, + 8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u, + 0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u, + 8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u, +}; +__declspec(naked) __declspec(align(16)) +void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, + int width) { + __asm { + mov eax, [esp + 4] // src_argb0 + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width + sub edx, eax + vmovdqa ymm4, kUnattenShuffleAlpha_AVX2 + vpcmpeqb ymm5, ymm5, ymm5 // generate mask 0xff000000 + vpslld ymm5, ymm5, 24 + + align 16 + convertloop: + vmovdqu ymm6, [eax] // read 8 pixels. + vpcmpeqb ymm7, ymm7, ymm7 // generate mask 0xffffffff for gather. 
+ vpsrld ymm2, ymm6, 24 // alpha in low 8 bits. + vpunpcklbw ymm0, ymm6, ymm6 // low 4 pixels. mutated. + vpunpckhbw ymm1, ymm6, ymm6 // high 4 pixels. mutated. + vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm7 // ymm7 cleared. + vpunpcklwd ymm2, ymm3, ymm7 // low 4 inverted alphas. mutated. + vpunpckhwd ymm3, ymm3, ymm7 // high 4 inverted alphas. mutated. + vpshufb ymm2, ymm2, ymm4 // replicate low 4 alphas + vpshufb ymm3, ymm3, ymm4 // replicate high 4 alphas + vpmulhuw ymm0, ymm0, ymm2 // rgb * ia + vpmulhuw ymm1, ymm1, ymm3 // rgb * ia + vpand ymm6, ymm6, ymm5 // isolate alpha + vpackuswb ymm0, ymm0, ymm1 // unmutated. + vpor ymm0, ymm0, ymm6 // copy original alpha + sub ecx, 8 + vmovdqu [eax + edx], ymm0 + lea eax, [eax + 32] + jg convertloop + + ret + } +} +#endif // HAS_ARGBUNATTENUATEROW_AVX2 + #ifdef HAS_ARGBGRAYROW_SSSE3 // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R static const vec8 kARGBToGray = { diff --git a/source/row_x86.asm b/source/row_x86.asm index 3a028c196..80a9716ba 100644 --- a/source/row_x86.asm +++ b/source/row_x86.asm @@ -42,7 +42,7 @@ cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix %endif packuswb m0, m0, m1 %if cpuflag(AVX2) - vpermq m0, m0, 0xd8 + vpermq m0, m0, 0xd8 %endif sub pixd, mmsize mov%2 [dst_yq], m0 @@ -86,8 +86,8 @@ cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix packuswb m0, m0, m1 packuswb m2, m2, m3 %if cpuflag(AVX2) - vpermq m0, m0, 0xd8 - vpermq m2, m2, 0xd8 + vpermq m0, m0, 0xd8 + vpermq m2, m2, 0xd8 %endif mov%1 [dst_uq], m0 mov%1 [dst_uq + dst_vq], m2 diff --git a/source/scale.cc b/source/scale.cc index b893b2409..92358a34b 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -42,13 +42,7 @@ void SetUseReferenceImpl(bool use) { } // ScaleRowDown2Int also used by planar functions - -/** - * NEON downscalers with interpolation. - * - * Provided by Fritz Koenig - * - */ +// NEON downscalers with interpolation. 
#if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) #define HAS_SCALEROWDOWN2_NEON @@ -98,13 +92,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction); -/** - * SSE2 downscalers with interpolation. - * - * Provided by Frank Barchard (fbarchard@google.com) - * - */ - +// SSE2 downscalers with interpolation. // Constants for SSSE3 code #elif !defined(YUV_DISABLE_ASM) && \ (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__)) @@ -2630,13 +2618,10 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride, } } -/** - * Scale plane, 1/2 - * - * This is an optimized version for scaling down a plane to 1/2 of - * its original size. - * - */ +// Scale plane, 1/2 +// This is an optimized version for scaling down a plane to 1/2 of +// its original size. + static void ScalePlaneDown2(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -2676,12 +2661,10 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, } } -/** - * Scale plane, 1/4 - * - * This is an optimized version for scaling down a plane to 1/4 of - * its original size. - */ +// Scale plane, 1/4 +// This is an optimized version for scaling down a plane to 1/4 of +// its original size. + static void ScalePlaneDown4(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -2717,13 +2700,10 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, } } -/** - * Scale plane, 1/8 - * - * This is an optimized version for scaling down a plane to 1/8 - * of its original size. - * - */ +// Scale plane, 1/8 +// This is an optimized version for scaling down a plane to 1/8 +// of its original size. 
+ static void ScalePlaneDown8(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -2748,12 +2728,8 @@ static void ScalePlaneDown8(int /* src_width */, int /* src_height */, } } -/** - * Scale plane down, 3/4 - * - * Provided by Frank Barchard (fbarchard@google.com) - * - */ +// Scale plane down, 3/4 + static void ScalePlaneDown34(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -2839,23 +2815,22 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } } -/** - * Scale plane, 3/8 - * - * This is an optimized version for scaling down a plane to 3/8 - * of its original size. - * - * Uses box filter arranges like this - * aaabbbcc -> abc - * aaabbbcc def - * aaabbbcc ghi - * dddeeeff - * dddeeeff - * dddeeeff - * ggghhhii - * ggghhhii - * Boxes are 3x3, 2x3, 3x2 and 2x2 - */ + +// Scale plane, 3/8 +// This is an optimized version for scaling down a plane to 3/8 +// of its original size. +// +// Uses box filter arranges like this +// aaabbbcc -> abc +// aaabbbcc def +// aaabbbcc ghi +// dddeeeff +// dddeeeff +// dddeeeff +// ggghhhii +// ggghhhii +// Boxes are 3x3, 2x3, 3x2 and 2x2 + static void ScalePlaneDown38(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -2991,15 +2966,14 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx, } } -/** - * Scale plane down to any dimensions, with interpolation. - * (boxfilter). - * - * Same method as SimpleScale, which is fixed point, outputting - * one pixel of destination using fixed point (16.16) to step - * through source, sampling a box of pixel with simple - * averaging. - */ +// Scale plane down to any dimensions, with interpolation. +// (boxfilter). 
+// +// Same method as SimpleScale, which is fixed point, outputting +// one pixel of destination using fixed point (16.16) to step +// through source, sampling a box of pixel with simple +// averaging. + static void ScalePlaneBox(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -3008,8 +2982,6 @@ static void ScalePlaneBox(int src_width, int src_height, assert(dst_height > 0); int dx = (src_width << 16) / dst_width; int dy = (src_height << 16) / dst_height; -// int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1); -// int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1); int x = 0; int y = 0; int maxy = (src_height << 16); @@ -3063,9 +3035,8 @@ static void ScalePlaneBox(int src_width, int src_height, } } -/** - * Scale plane to/from any dimensions, with interpolation. - */ +// Scale plane to/from any dimensions, with interpolation. + static void ScalePlaneBilinearSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -3104,10 +3075,9 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height, } } -/** - * Scale plane to/from any dimensions, with bilinear - * interpolation. - */ + +// Scale plane to/from any dimensions, with bilinear interpolation. + void ScalePlaneBilinear(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -3170,12 +3140,11 @@ void ScalePlaneBilinear(int src_width, int src_height, } } -/** - * Scale plane to/from any dimensions, without interpolation. - * Fixed point math is used for performance: The upper 16 bits - * of x and dx is the integer part of the source position and - * the lower 16 bits are the fixed decimal part. - */ +// Scale plane to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. 
+ static void ScalePlaneSimple(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -3197,9 +3166,8 @@ static void ScalePlaneSimple(int src_width, int src_height, } } -/** - * Scale plane to/from any dimensions. - */ +// Scale plane to/from any dimensions. + static void ScalePlaneAnySize(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -3215,14 +3183,12 @@ static void ScalePlaneAnySize(int src_width, int src_height, } } -/** - * Scale plane down, any size - * - * This is an optimized version for scaling down a plane to any size. - * The current implementation is ~10 times faster compared to the - * reference implementation for e.g. XGA->LowResPAL - * - */ +// Scale plane down, any size +// +// This is an optimized version for scaling down a plane to any size. +// The current implementation is ~10 times faster compared to the +// reference implementation for e.g. XGA->LowResPAL + static void ScalePlaneDown(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, diff --git a/source/scale_argb.cc b/source/scale_argb.cc index aa10d6120..81d0a33ca 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -44,10 +44,6 @@ void ScaleARGBFilterRows_NEON(uint8* dst_ptr, int dst_width, int source_y_fraction); #endif -/** - * SSE2 downscalers with bilinear interpolation. - */ - #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_SCALEARGBROWDOWN2_SSE2 @@ -880,13 +876,10 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, dst_argb[3] = dst_argb[-1]; } -/** - * ScaleARGB ARGB, 1/2 - * - * This is an optimized version for scaling down a ARGB to 1/2 of - * its original size. - * - */ +// ScaleARGB ARGB, 1/2 +// This is an optimized version for scaling down a ARGB to 1/2 of +// its original size. 
+ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -918,13 +911,10 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */, } } -/** - * ScaleARGB ARGB Even - * - * This is an optimized version for scaling down a ARGB to even - * multiple of its original size. - * - */ +// ScaleARGB ARGB Even +// This is an optimized version for scaling down a ARGB to even +// multiple of its original size. + static void ScaleARGBDownEven(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, @@ -959,10 +949,9 @@ static void ScaleARGBDownEven(int src_width, int src_height, dst_argb += dst_stride; } } -/** - * ScaleARGB ARGB to/from any dimensions, with bilinear - * interpolation. - */ + +// ScaleARGB ARGB to/from any dimensions, with bilinear +// interpolation. // Maximum width handled by 2 pass Bilinear. static const int kMaxInputWidth = 2560; @@ -1033,12 +1022,11 @@ static void ScaleARGBCols(uint8* dst_argb, const uint8* src_argb, } } -/** - * ScaleARGB ARGB to/from any dimensions, without interpolation. - * Fixed point math is used for performance: The upper 16 bits - * of x and dx is the integer part of the source position and - * the lower 16 bits are the fixed decimal part. - */ + +// ScaleARGB ARGB to/from any dimensions, without interpolation. +// Fixed point math is used for performance: The upper 16 bits +// of x and dx is the integer part of the source position and +// the lower 16 bits are the fixed decimal part. static void ScaleARGBSimple(int src_width, int src_height, int dst_width, int dst_height, @@ -1056,9 +1044,8 @@ static void ScaleARGBSimple(int src_width, int src_height, } } -/** - * ScaleARGB ARGB to/from any dimensions. - */ +// ScaleARGB ARGB to/from any dimensions. 
+ static void ScaleARGBAnySize(int src_width, int src_height, int dst_width, int dst_height, int src_stride, int dst_stride, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index c02d1007a..8089f9cb9 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -167,6 +167,72 @@ TEST_F(libyuvTest, ARGBAttenuate_Opt) { EXPECT_LE(max_diff, 2); } +static int TestUnattenuateI(int width, int height, int benchmark_iterations, + int invert, int off) { + const int kBpp = 4; + const int kStride = (width * kBpp + 15) & ~15; + align_buffer_64(src_argb, kStride * height + off); + align_buffer_64(dst_argb_c, kStride * height); + align_buffer_64(dst_argb_opt, kStride * height); + srandom(time(NULL)); + for (int i = 0; i < kStride * height; ++i) { + src_argb[i + off] = (random() & 0xff); + } + ARGBAttenuate(src_argb + off, kStride, + src_argb + off, kStride, + width, height); + memset(dst_argb_c, 0, kStride * height); + memset(dst_argb_opt, 0, kStride * height); + + MaskCpuFlags(0); + ARGBUnattenuate(src_argb + off, kStride, + dst_argb_c, kStride, + width, invert * height); + MaskCpuFlags(-1); + for (int i = 0; i < benchmark_iterations; ++i) { + ARGBUnattenuate(src_argb + off, kStride, + dst_argb_opt, kStride, + width, invert * height); + } + int max_diff = 0; + for (int i = 0; i < kStride * height; ++i) { + int abs_diff = + abs(static_cast<int>(dst_argb_c[i]) - + static_cast<int>(dst_argb_opt[i])); + if (abs_diff > max_diff) { + max_diff = abs_diff; + } + } + free_aligned_buffer_64(src_argb) + free_aligned_buffer_64(dst_argb_c) + free_aligned_buffer_64(dst_argb_opt) + return max_diff; +} + +TEST_F(libyuvTest, ARGBUnattenuate_Any) { + int max_diff = TestUnattenuateI(benchmark_width_ - 1, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 2); +} + +TEST_F(libyuvTest, ARGBUnattenuate_Unaligned) { + int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 1); + EXPECT_LE(max_diff, 2); +} + 
+TEST_F(libyuvTest, ARGBUnattenuate_Invert) { + int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, + benchmark_iterations_, -1, 0); + EXPECT_LE(max_diff, 2); +} + +TEST_F(libyuvTest, ARGBUnattenuate_Opt) { + int max_diff = TestUnattenuateI(benchmark_width_, benchmark_height_, + benchmark_iterations_, +1, 0); + EXPECT_LE(max_diff, 2); +} + TEST_F(libyuvTest, TestARGBComputeCumulativeSum) { SIMD_ALIGNED(uint8 orig_pixels[16][16][4]); SIMD_ALIGNED(int32 added_pixels[16][16][4]);