Unattenuate AVX2

BUG=190 TEST=planar_test Review URL: https://webrtc-codereview.appspot.com/1112004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@577 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-02-16 15:19:52 +08:00 · 2013-02-20 22:18:36 +00:00 · 2013-02-20 22:18:36 +00:00 · 3c7bb050bd
commit 3c7bb050bd
parent d5ee3dc912
10 changed files with 204 additions and 130 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 576
+Version: 577
 License: BSD
 License File: LICENSE
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -140,6 +140,7 @@ extern "C" {
 // Effects
 #define HAS_ARGBATTENUATEROW_AVX2
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif
 #endif
@ -1324,6 +1325,7 @@ void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
 extern uint32 fixed_invtbl8[256];
 void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBGrayRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
--- a/source/compare.cc
+++ b/source/compare.cc
@ -145,11 +145,9 @@ LIBYUV_API
 uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
                                  const uint8* src_b, int stride_b,
                                  int width, int height) {
  if (stride_a == width && stride_b == width) {
    return ComputeSumSquareError(src_a, src_b, width * height);
  }
  uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
      SumSquareError_C;
 #if defined(HAS_SUMSQUAREERROR_NEON)
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1085,6 +1085,14 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
    ARGBUnattenuateRow = ARGBUnattenuateRow_SSE2;
  }
 #endif
 #if defined(HAS_ARGBUNATTENUATEROW_AVX2)
  // Records whether the AVX2 row function was selected. It is declared once,
  // outside the if, and only assigned inside it: redeclaring it inside the
  // braces would create a shadowing local and leave this flag false.
  // NOTE(review): presumably read later in this function (not visible in this
  // hunk) to decide whether upper YMM state must be cleared - confirm.
  bool clear = false;
  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
    clear = true;
    ARGBUnattenuateRow = ARGBUnattenuateRow_AVX2;
  }
 #endif
 // TODO(fbarchard): Neon version.
  for (int y = 0; y < height; ++y) {
    ARGBUnattenuateRow(src_argb, dst_argb, width);
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -1528,7 +1528,7 @@ void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
 // 8.16 fixed point inverse table
 // Table entry for alpha value a: 0x10000 / a. The argument and the whole
 // expansion are parenthesized so the macro evaluates correctly for any
 // argument expression and inside any surrounding expression.
 #define T(a) (0x10000 / (a))
 uint32 fixed_invtbl8[256] = {
-  0x0100, T(0x01), T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
+  0xffff, 0xffff, T(0x02), T(0x03), T(0x04), T(0x05), T(0x06), T(0x07),
  T(0x08), T(0x09), T(0x0a), T(0x0b), T(0x0c), T(0x0d), T(0x0e), T(0x0f),
  T(0x10), T(0x11), T(0x12), T(0x13), T(0x14), T(0x15), T(0x16), T(0x17),
  T(0x18), T(0x19), T(0x1a), T(0x1b), T(0x1c), T(0x1d), T(0x1e), T(0x1f),
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -4462,6 +4462,53 @@ void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
 }
 #endif  // HAS_ARGBUNATTENUATEROW_SSE2
 #ifdef HAS_ARGBUNATTENUATEROW_AVX2
 // Byte-shuffle control used with vpshufb in ARGBUnattenuateRow_AVX2.
 // One row per 16 bytes: source bytes 0-1 are copied into the first three
 // 16-bit slots, source bytes 8-9 into the next three 16-bit slots of the
 // upper half; the 128u entries (high bit set) zero the remaining slot.
 static const ulvec8 kUnattenShuffleAlpha_AVX2 = {
  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u, 8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
  0u, 1u, 0u, 1u, 0u, 1u, 128u, 128u, 8u, 9u, 8u, 9u, 8u, 9u, 128u, 128u,
 };
 // Unattenuates one row of 32-bit pixels, 8 pixels (32 bytes) per iteration.
 // For each pixel the top byte (alpha) indexes fixed_invtbl8; the other three
 // bytes are multiplied by that table value (vpmulhuw keeps the high 16 bits
 // of each product) and the original alpha byte is written back unchanged.
 //
 // src_argb, dst_argb and width are read from the stack at [esp + 4],
 // [esp + 8] and [esp + 12]. The loop body always runs at least once and
 // always handles a full group of 8 pixels.
 // NOTE(review): no vzeroupper is executed before ret; presumably the caller
 // is responsible for clearing upper YMM state - confirm.
 __declspec(naked) __declspec(align(16))
 void ARGBUnattenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst.
    vmovdqa    ymm4, kUnattenShuffleAlpha_AVX2  // shuffle control, loaded once.
    vpcmpeqb   ymm5, ymm5, ymm5  // generate mask 0xff000000
    vpslld     ymm5, ymm5, 24
    align      16
 convertloop:
    vmovdqu    ymm6, [eax]       // read 8 pixels.
    vpcmpeqb   ymm7, ymm7, ymm7  // generate mask 0xffffffff for gather.
    vpsrld     ymm2, ymm6, 24    // alpha in low 8 bits.
    vpunpcklbw ymm0, ymm6, ymm6  // low 4 pixels. mutated.
    vpunpckhbw ymm1, ymm6, ymm6  // high 4 pixels. mutated.
    // Look up fixed_invtbl8[alpha] for all 8 pixels. The gather clears its
    // mask register, so ymm7 is all zero for the two unpacks below.
    vpgatherdd ymm3, [ymm2 * 4 + fixed_invtbl8], ymm7  // ymm7 cleared.
    vpunpcklwd ymm2, ymm3, ymm7  // low 4 inverted alphas. mutated.
    vpunpckhwd ymm3, ymm3, ymm7  // high 4 inverted alphas. mutated.
    vpshufb    ymm2, ymm2, ymm4  // replicate low 4 alphas
    vpshufb    ymm3, ymm3, ymm4  // replicate high 4 alphas
    vpmulhuw   ymm0, ymm0, ymm2  // rgb * ia
    vpmulhuw   ymm1, ymm1, ymm3  // rgb * ia
    vpand      ymm6, ymm6, ymm5  // isolate alpha
    vpackuswb  ymm0, ymm0, ymm1  // unmutated.
    vpor       ymm0, ymm0, ymm6  // copy original alpha
    sub        ecx, 8            // 8 pixels done; sets the flags tested by jg.
    vmovdqu    [eax + edx], ymm0  // store 8 pixels to dst; flags untouched.
    lea        eax, [eax + 32]   // advance src by 32 bytes; flags untouched.
    jg         convertloop       // repeat while remaining width > 0.
    ret
  }
 }
 #endif  // HAS_ARGBUNATTENUATEROW_AVX2
 #ifdef HAS_ARGBGRAYROW_SSSE3
 // Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
 static const vec8 kARGBToGray = {
--- a/source/scale.cc
+++ b/source/scale.cc
@ -42,13 +42,7 @@ void SetUseReferenceImpl(bool use) {
 }
 // ScaleRowDown2Int also used by planar functions
-
+// NEON downscalers with interpolation.
 /**
 * NEON downscalers with interpolation.
 *
 * Provided by Fritz Koenig
 *
 */
 #if !defined(YUV_DISABLE_ASM) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_SCALEROWDOWN2_NEON
@ -98,13 +92,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
                          const uint8* src_ptr, ptrdiff_t src_stride,
                          int dst_width, int source_y_fraction);
-/**
+// SSE2 downscalers with interpolation.
 * SSE2 downscalers with interpolation.
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */
 // Constants for SSSE3 code
 #elif !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))
@ -2630,13 +2618,10 @@ void ScaleAddRows_C(const uint8* src_ptr, ptrdiff_t src_stride,
  }
 }
-/**
+// Scale plane, 1/2
- * Scale plane, 1/2
+// This is an optimized version for scaling down a plane to 1/2 of
- *
+// its original size.
- * This is an optimized version for scaling down a plane to 1/2 of
+
 * its original size.
 *
 */
 static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
@ -2676,12 +2661,10 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
  }
 }
-/**
+// Scale plane, 1/4
- * Scale plane, 1/4
+// This is an optimized version for scaling down a plane to 1/4 of
- *
+// its original size.
- * This is an optimized version for scaling down a plane to 1/4 of
+
 * its original size.
 */
 static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
@ -2717,13 +2700,10 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
  }
 }
-/**
+// Scale plane, 1/8
- * Scale plane, 1/8
+// This is an optimized version for scaling down a plane to 1/8
- *
+// of its original size.
- * This is an optimized version for scaling down a plane to 1/8
+
 * of its original size.
 *
 */
 static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
                            int dst_width, int dst_height,
                            int src_stride, int dst_stride,
@ -2748,12 +2728,8 @@ static void ScalePlaneDown8(int /* src_width */, int /* src_height */,
  }
 }
-/**
+// Scale plane down, 3/4
- * Scale plane down, 3/4
+
 *
 * Provided by Frank Barchard (fbarchard@google.com)
 *
 */
 static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
@ -2839,23 +2815,22 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
  }
 }
-/**
+
- * Scale plane, 3/8
+// Scale plane, 3/8
- *
+// This is an optimized version for scaling down a plane to 3/8
- * This is an optimized version for scaling down a plane to 3/8
+// of its original size.
- * of its original size.
+//
- *
+// Uses box filters arranged like this
- * Uses box filter arranges like this
+// aaabbbcc -> abc
- * aaabbbcc -> abc
+// aaabbbcc    def
- * aaabbbcc    def
+// aaabbbcc    ghi
- * aaabbbcc    ghi
+// dddeeeff
- * dddeeeff
+// dddeeeff
- * dddeeeff
+// dddeeeff
- * dddeeeff
+// ggghhhii
- * ggghhhii
+// ggghhhii
- * ggghhhii
+// Boxes are 3x3, 2x3, 3x2 and 2x2
- * Boxes are 3x3, 2x3, 3x2 and 2x2
+
 */
 static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
@ -2991,15 +2966,14 @@ static void ScaleAddCols1_C(int dst_width, int boxheight, int x, int dx,
  }
 }
-/**
+// Scale plane down to any dimensions, with interpolation.
- * Scale plane down to any dimensions, with interpolation.
+// (boxfilter).
- * (boxfilter).
+//
- *
+// Same method as SimpleScale, which is fixed point, outputting
- * Same method as SimpleScale, which is fixed point, outputting
+// one pixel of destination using fixed point (16.16) to step
- * one pixel of destination using fixed point (16.16) to step
+// through source, sampling a box of pixels with simple
- * through source, sampling a box of pixel with simple
+// averaging.
- * averaging.
+
 */
 static void ScalePlaneBox(int src_width, int src_height,
                          int dst_width, int dst_height,
                          int src_stride, int dst_stride,
@ -3008,8 +2982,6 @@ static void ScalePlaneBox(int src_width, int src_height,
  assert(dst_height > 0);
  int dx = (src_width << 16) / dst_width;
  int dy = (src_height << 16) / dst_height;
 //  int x = (dx >= 65536) ? ((dx >> 1) - 32768) : (dx >> 1);
 //  int y = (dy >= 65536) ? ((dy >> 1) - 32768) : (dy >> 1);
  int x = 0;
  int y = 0;
  int maxy = (src_height << 16);
@ -3063,9 +3035,8 @@ static void ScalePlaneBox(int src_width, int src_height,
  }
 }
-/**
+// Scale plane to/from any dimensions, with interpolation.
- * Scale plane to/from any dimensions, with interpolation.
+
 */
 static void ScalePlaneBilinearSimple(int src_width, int src_height,
                                     int dst_width, int dst_height,
                                     int src_stride, int dst_stride,
@ -3104,10 +3075,9 @@ static void ScalePlaneBilinearSimple(int src_width, int src_height,
  }
 }
-/**
+
- * Scale plane to/from any dimensions, with bilinear
+// Scale plane to/from any dimensions, with bilinear interpolation.
- * interpolation.
+
 */
 void ScalePlaneBilinear(int src_width, int src_height,
                        int dst_width, int dst_height,
                        int src_stride, int dst_stride,
@ -3170,12 +3140,11 @@ void ScalePlaneBilinear(int src_width, int src_height,
  }
 }
-/**
+// Scale plane to/from any dimensions, without interpolation.
- * Scale plane to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
- * Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
- * of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
- * the lower 16 bits are the fixed decimal part.
+
 */
 static void ScalePlaneSimple(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
@ -3197,9 +3166,8 @@ static void ScalePlaneSimple(int src_width, int src_height,
  }
 }
-/**
+// Scale plane to/from any dimensions.
- * Scale plane to/from any dimensions.
+
 */
 static void ScalePlaneAnySize(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
@ -3215,14 +3183,12 @@ static void ScalePlaneAnySize(int src_width, int src_height,
  }
 }
-/**
+// Scale plane down, any size
- * Scale plane down, any size
+//
- *
+// This is an optimized version for scaling down a plane to any size.
- * This is an optimized version for scaling down a plane to any size.
+// The current implementation is ~10 times faster compared to the
- * The current implementation is ~10 times faster compared to the
+// reference implementation for e.g. XGA->LowResPAL
- * reference implementation for e.g. XGA->LowResPAL
+
 *
 */
 static void ScalePlaneDown(int src_width, int src_height,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@ -44,10 +44,6 @@ void ScaleARGBFilterRows_NEON(uint8* dst_ptr,
                              int dst_width, int source_y_fraction);
 #endif
 /**
 * SSE2 downscalers with bilinear interpolation.
 */
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SCALEARGBROWDOWN2_SSE2
@ -880,13 +876,10 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
  dst_argb[3] = dst_argb[-1];
 }
-/**
+// ScaleARGB ARGB, 1/2
- * ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down an ARGB to 1/2 of
- *
+// its original size.
- * This is an optimized version for scaling down a ARGB to 1/2 of
+
 * its original size.
 *
 */
 static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                           int dst_width, int dst_height,
                           int src_stride, int dst_stride,
@ -918,13 +911,10 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
  }
 }
-/**
+// ScaleARGB ARGB Even
- * ScaleARGB ARGB Even
+// This is an optimized version for scaling down an ARGB to even
- *
+// multiple of its original size.
- * This is an optimized version for scaling down a ARGB to even
+
 * multiple of its original size.
 *
 */
 static void ScaleARGBDownEven(int src_width, int src_height,
                              int dst_width, int dst_height,
                              int src_stride, int dst_stride,
@ -959,10 +949,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
    dst_argb += dst_stride;
  }
 }
-/**
+
- * ScaleARGB ARGB to/from any dimensions, with bilinear
+// ScaleARGB ARGB to/from any dimensions, with bilinear
- * interpolation.
+// interpolation.
 */
 // Maximum width handled by 2 pass Bilinear.
 static const int kMaxInputWidth = 2560;
@ -1033,12 +1022,11 @@ static void ScaleARGBCols(uint8* dst_argb, const uint8* src_argb,
  }
 }
-/**
+
- * ScaleARGB ARGB to/from any dimensions, without interpolation.
+// ScaleARGB ARGB to/from any dimensions, without interpolation.
- * Fixed point math is used for performance: The upper 16 bits
+// Fixed point math is used for performance: The upper 16 bits
- * of x and dx is the integer part of the source position and
+// of x and dx is the integer part of the source position and
- * the lower 16 bits are the fixed decimal part.
+// the lower 16 bits are the fixed decimal part.
 */
 static void ScaleARGBSimple(int src_width, int src_height,
                            int dst_width, int dst_height,
@ -1056,9 +1044,8 @@ static void ScaleARGBSimple(int src_width, int src_height,
  }
 }
-/**
+// ScaleARGB ARGB to/from any dimensions.
- * ScaleARGB ARGB to/from any dimensions.
+
 */
 static void ScaleARGBAnySize(int src_width, int src_height,
                             int dst_width, int dst_height,
                             int src_stride, int dst_stride,
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -167,6 +167,72 @@ TEST_F(libyuvTest, ARGBAttenuate_Opt) {
  EXPECT_LE(max_diff, 2);
 }
 // Fills a source image with random bytes, attenuates it in place, then runs
 // ARGBUnattenuate twice: once after MaskCpuFlags(0) into dst_argb_c and
 // benchmark_iterations times after MaskCpuFlags(-1) into dst_argb_opt.
 // Returns the largest absolute per-byte difference between the two outputs.
 //   invert: multiplied into the height passed to ARGBUnattenuate.
 //   off:    byte offset added to the source pointer.
 static int TestUnattenuateI(int width, int height, int benchmark_iterations,
                            int invert, int off) {
  const int kBpp = 4;
  // Row stride in bytes, rounded up to a multiple of 16.
  const int kStride = (width * kBpp + 15) & ~15;
  const int kPlaneSize = kStride * height;
  align_buffer_64(src_argb, kPlaneSize + off);
  align_buffer_64(dst_argb_c, kPlaneSize);
  align_buffer_64(dst_argb_opt, kPlaneSize);

  srandom(time(NULL));
  for (int pos = 0; pos < kPlaneSize; ++pos) {
    src_argb[off + pos] = (random() & 0xff);
  }
  // Attenuate the random input in place before unattenuating it.
  ARGBAttenuate(src_argb + off, kStride,
                src_argb + off, kStride,
                width, height);

  memset(dst_argb_c, 0, kPlaneSize);
  memset(dst_argb_opt, 0, kPlaneSize);

  MaskCpuFlags(0);
  ARGBUnattenuate(src_argb + off, kStride,
                  dst_argb_c, kStride,
                  width, invert * height);

  MaskCpuFlags(-1);
  for (int iter = 0; iter < benchmark_iterations; ++iter) {
    ARGBUnattenuate(src_argb + off, kStride,
                    dst_argb_opt, kStride,
                    width, invert * height);
  }

  int max_diff = 0;
  for (int pos = 0; pos < kPlaneSize; ++pos) {
    const int delta = abs(static_cast<int>(dst_argb_c[pos]) -
                          static_cast<int>(dst_argb_opt[pos]));
    max_diff = (delta > max_diff) ? delta : max_diff;
  }

  free_aligned_buffer_64(src_argb)
  free_aligned_buffer_64(dst_argb_c)
  free_aligned_buffer_64(dst_argb_opt)
  return max_diff;
 }
 // Width is one less than benchmark_width_; height positive, source offset 0.
 TEST_F(libyuvTest, ARGBUnattenuate_Any) {
  const int max_diff = TestUnattenuateI(
      benchmark_width_ - 1, benchmark_height_, benchmark_iterations_,
      +1,   // invert
      0);   // off
  EXPECT_LE(max_diff, 2);
 }
 // Full benchmark width; source pointer offset by 1 byte.
 TEST_F(libyuvTest, ARGBUnattenuate_Unaligned) {
  const int max_diff = TestUnattenuateI(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      +1,   // invert
      1);   // off
  EXPECT_LE(max_diff, 2);
 }
 // Full benchmark width; invert = -1, so a negated height is passed to
 // ARGBUnattenuate.
 TEST_F(libyuvTest, ARGBUnattenuate_Invert) {
  const int max_diff = TestUnattenuateI(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      -1,   // invert
      0);   // off
  EXPECT_LE(max_diff, 2);
 }
 // Full benchmark width, positive height, source offset 0.
 TEST_F(libyuvTest, ARGBUnattenuate_Opt) {
  const int max_diff = TestUnattenuateI(
      benchmark_width_, benchmark_height_, benchmark_iterations_,
      +1,   // invert
      0);   // off
  EXPECT_LE(max_diff, 2);
 }
 TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
  SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
  SIMD_ALIGNED(int32 added_pixels[16][16][4]);