Add 2 ARGB Images together and store to destination

BUG=175 TEST=Add unittest Review URL: https://webrtc-codereview.appspot.com/1049004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@543 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-07 01:06:46 +08:00 · 2013-01-18 23:03:56 +00:00 · 2013-01-18 23:03:56 +00:00 · 83e1b17cc0
commit 83e1b17cc0
parent 8fa7634994
10 changed files with 249 additions and 28 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 542
+Version: 543
 License: BSD
 License File: LICENSE
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -216,6 +216,13 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
                 uint8* dst_argb, int dst_stride_argb,
                 int width, int height);
 // Add ARGB image with ARGB image.
 LIBYUV_API
 int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
            const uint8* src_argb1, int src_stride_argb1,
            uint8* dst_argb, int dst_stride_argb,
            int width, int height);
 // Convert I422 to YUY2.
 LIBYUV_API
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -99,6 +99,7 @@ extern "C" {
 #define HAS_YUY2TOYROW_SSE2
 // Effects
 #define HAS_ARGBADDROW_SSE2
 #define HAS_ARGBAFFINEROW_SSE2
 #define HAS_ARGBATTENUATEROW_SSSE3
 #define HAS_ARGBBLENDROW_SSSE3
@ -967,7 +968,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb, const uint8* src_argb1,
 void ARGBBlendRow_C(const uint8* src_argb, const uint8* src_argb1,
                    uint8* dst_argb, int width);
-// ARGB preattenuated alpha blend.  Same API as Blend, but these require
+// ARGB multiply images.  Same API as Blend, but these require
 // pointer and width alignment for SSE2.
 void ARGBMultiplyRow_C(const uint8* src_argb, const uint8* src_argb1,
                       uint8* dst_argb, int width);
@ -976,6 +977,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
 void ARGBMultiplyRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                              uint8* dst_argb, int width);
 // ARGB add images.
 void ARGBAddRow_C(const uint8* src_argb, const uint8* src_argb1,
                  uint8* dst_argb, int width);
 void ARGBAddRow_SSE2(const uint8* src_argb, const uint8* src_argb1,
                     uint8* dst_argb, int width);
 void ARGBAddRow_Any_SSE2(const uint8* src_argb, const uint8* src_argb1,
                         uint8* dst_argb, int width);
 void ARGBToRGB24Row_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRAWRow_Any_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToRGB565Row_Any_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 542
+#define LIBYUV_VERSION 543
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -448,6 +448,50 @@ int ARGBMultiply(const uint8* src_argb0, int src_stride_argb0,
  return 0;
 }
 // Add 2 ARGB images together and store to destination.
 LIBYUV_API
 int ARGBAdd(const uint8* src_argb0, int src_stride_argb0,
            const uint8* src_argb1, int src_stride_argb1,
            uint8* dst_argb, int dst_stride_argb,
            int width, int height) {
  if (!src_argb0 || !src_argb1 || !dst_argb || width <= 0 || height == 0) {
    return -1;
  }
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
    dst_stride_argb = -dst_stride_argb;
  }
  void (*ARGBAddRow)(const uint8* src0, const uint8* src1, uint8* dst,
                     int width) = ARGBAddRow_C;
 #if defined(HAS_ARGBADDROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
      IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
      IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
      IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
    ARGBAddRow = ARGBAddRow_Any_SSE2;
    if (IS_ALIGNED(width, 4)) {
      ARGBAddRow = ARGBAddRow_SSE2;
    }
  }
 #elif defined(HAS_ARGBADDROW_NEON)
  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
    ARGBAddRow = ARGBAddRow_NEON;
  }
 #endif
  // Add plane
  for (int y = 0; y < height; ++y) {
    ARGBAddRow(src_argb0, src_argb1, dst_argb, width);
    src_argb0 += src_stride_argb0;
    src_argb1 += src_stride_argb1;
    dst_argb += dst_stride_argb;
  }
  return 0;
 }
 // Convert I422 to BGRA.
 LIBYUV_API
 int I422ToBGRA(const uint8* src_y, int src_stride_y,
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -324,7 +324,7 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
 #endif
 #undef UV422ANY
-#define SplitUVRowANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                  \
+#define SPLITUVROWANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                  \
    void NAMEANY(const uint8* src_uv,                                          \
                 uint8* dst_u, uint8* dst_v, int width) {                      \
      int n = width & ~MASK;                                                   \
@ -336,21 +336,21 @@ UV422ANY(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON,
    }
 #ifdef HAS_SPLITUVROW_SSE2
-SplitUVRowANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_SSE2, SplitUVRow_Unaligned_SSE2, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_AVX2
-SplitUVRowANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
+SPLITUVROWANY(SplitUVRow_Any_AVX2, SplitUVRow_Unaligned_AVX2, SplitUVRow_C, 31)
 #endif
 #ifdef HAS_SPLITUVROW_NEON
-SplitUVRowANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
+SPLITUVROWANY(SplitUVRow_Any_NEON, SplitUVRow_NEON, SplitUVRow_C, 15)
 #endif
 #ifdef HAS_SPLITUVROW_MIPS_DSPR2
-SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
+SPLITUVROWANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
              SplitUVRow_C, 15)
 #endif
-#undef SplitUVRowANY
+#undef SPLITUVROWANY
-#define MergeUVRow_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
+#define MERGEUVROW_ANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, MASK)                 \
    void NAMEANY(const uint8* src_u, const uint8* src_v,                       \
                 uint8* dst_uv, int width) {                                   \
      int n = width & ~MASK;                                                   \
@ -362,17 +362,17 @@ SplitUVRowANY(SplitUVRow_Any_MIPS_DSPR2, SplitUVRow_Unaligned_MIPS_DSPR2,
    }
 #ifdef HAS_MERGEUVROW_SSE2
-MergeUVRow_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_SSE2, MergeUVRow_Unaligned_SSE2, MergeUVRow_C, 15)
 #endif
 #ifdef HAS_MERGEUVROW_AVX2
-MergeUVRow_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
+MERGEUVROW_ANY(MergeUVRow_Any_AVX2, MergeUVRow_Unaligned_AVX2, MergeUVRow_C, 31)
 #endif
 #ifdef HAS_MERGEUVROW_NEON
-MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
+MERGEUVROW_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
 #endif
-#undef MergeUVRow_ANY
+#undef MERGEUVROW_ANY
-#define MultiplyRow_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK)              \
+#define MATHROW_ANY(NAMEANY, ARGBMULT_SIMD, ARGBMULT_C, MASK)                  \
    void NAMEANY(const uint8* src_argb0, const uint8* src_argb1,               \
                 uint8* dst_argb, int width) {                                 \
      int n = width & ~MASK;                                                   \
@ -384,9 +384,13 @@ MergeUVRow_ANY(MergeUVRow_Any_NEON, MergeUVRow_NEON, MergeUVRow_C, 15)
    }
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-MultiplyRow_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2,
+MATHROW_ANY(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, ARGBMultiplyRow_C, 3)
                ARGBMultiplyRow_C, 3)
 #endif
 #ifdef HAS_ARGBADDROW_SSE2
 MATHROW_ANY(ARGBAddRow_Any_SSE2, ARGBAddRow_SSE2, ARGBAddRow_C, 3)
 #endif
 #undef MATHROW_ANY
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -727,6 +727,30 @@ void ARGBMultiplyRow_C(const uint8* src_argb0, const uint8* src_argb1,
 #undef REPEAT8
 #undef SHADE
 #define SHADE(f, v) ((v + f) > 255) ? 255 : (v + f)
 void ARGBAddRow_C(const uint8* src_argb0, const uint8* src_argb1,
                  uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const uint32 b = src_argb0[0];
    const uint32 g = src_argb0[1];
    const uint32 r = src_argb0[2];
    const uint32 a = src_argb0[3];
    const uint32 b_add = src_argb1[0];
    const uint32 g_add = src_argb1[1];
    const uint32 r_add = src_argb1[2];
    const uint32 a_add = src_argb1[3];
    dst_argb[0] = SHADE(b, b_add);
    dst_argb[1] = SHADE(g, g_add);
    dst_argb[2] = SHADE(r, r_add);
    dst_argb[3] = SHADE(a, a_add);
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
 }
 #undef SHADE
 void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
  // Copy a Y to RGB.
  for (int x = 0; x < width; ++x) {
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -3961,7 +3961,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADEROW_SSE2
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
@ -4001,6 +4001,39 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    // 4 pixel loop.
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    (%0,%1),%%xmm1                  \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%0,%2,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
 #if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
 #endif
  );
 }
 #endif  // HAS_ARGBADDROW_SSE2
 #ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
 // Creates a table of cumulative sums where each value is a sum of all values
 // above and to the left of the value, inclusive of the value.
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -4277,7 +4277,7 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #endif  // HAS_ARGBSHADEROW_SSE2
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
-// Multiple 2 rows of ARGB pixels together, 4 pixels at a time.
+// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
@ -4302,8 +4302,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    punpckhbw  xmm1, xmm1       // next 2
    punpcklbw  xmm2, xmm5       // first 2
    punpckhbw  xmm3, xmm5       // next 2
-    pmulhuw    xmm0, xmm2       // argb * value
+    pmulhuw    xmm0, xmm2       // src_argb0 * src_argb1 first 2
-    pmulhuw    xmm1, xmm3       // argb * value
+    pmulhuw    xmm1, xmm3       // src_argb0 * src_argb1 next 2
    packuswb   xmm0, xmm1
    sub        ecx, 4
    movdqa     [eax + edx], xmm0
@ -4316,6 +4316,38 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBMULTIPLYROW_SSE2
 #ifdef HAS_ARGBADDROW_SSE2
 // Add 2 rows of ARGB pixels together, 4 pixels at a time.
 // Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))
 void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  __asm {
    push       esi
    mov        eax, [esp + 4 + 4]   // src_argb0
    mov        esi, [esp + 4 + 8]   // src_argb1
    mov        edx, [esp + 4 + 12]  // dst_argb
    mov        ecx, [esp + 4 + 16]  // width
    pxor       xmm5, xmm5  // constant 0
    sub        esi, eax
    sub        edx, eax
    align      16
 convertloop:
    movdqa     xmm0, [eax]        // read 4 pixels from src_argb0
    movdqa     xmm1, [eax + esi]  // read 4 pixels from src_argb1
    paddusb    xmm0, xmm1         // src_argb0 + src_argb1
    sub        ecx, 4
    movdqa     [eax + edx], xmm0
    lea        eax, [eax + 16]
    jg         convertloop
    pop        esi
    ret
  }
 }
 #endif  // HAS_ARGBADDROW_SSE2
 #ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
 // Consider float CumulativeSum.
 // Consider calling CumulativeSum one row at time as needed.
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -968,4 +968,72 @@ TEST_F(libyuvTest, ARGBMultiply_Opt) {
  EXPECT_LE(max_diff, 1);
 }
 static int TestAdd(int width, int height, int benchmark_iterations,
                   int invert, int off) {
  const int kBpp = 4;
  const int kStride = (width * kBpp + 15) & ~15;
  align_buffer_64(src_argb_a, kStride * height + off);
  align_buffer_64(src_argb_b, kStride * height + off);
  align_buffer_64(dst_argb_c, kStride * height);
  align_buffer_64(dst_argb_opt, kStride * height);
  srandom(time(NULL));
  for (int i = 0; i < kStride * height; ++i) {
    src_argb_a[i + off] = (random() & 0xff);
    src_argb_b[i + off] = (random() & 0xff);
  }
  memset(dst_argb_c, 0, kStride * height);
  memset(dst_argb_opt, 0, kStride * height);
  MaskCpuFlags(0);
  ARGBAdd(src_argb_a + off, kStride,
          src_argb_b + off, kStride,
          dst_argb_c, kStride,
          width, invert * height);
  MaskCpuFlags(-1);
  for (int i = 0; i < benchmark_iterations; ++i) {
    ARGBAdd(src_argb_a + off, kStride,
            src_argb_b + off, kStride,
            dst_argb_opt, kStride,
            width, invert * height);
  }
  int max_diff = 0;
  for (int i = 0; i < kStride * height; ++i) {
    int abs_diff =
        abs(static_cast<int>(dst_argb_c[i]) -
            static_cast<int>(dst_argb_opt[i]));
    if (abs_diff > max_diff) {
      max_diff = abs_diff;
    }
  }
  free_aligned_buffer_64(src_argb_a)
  free_aligned_buffer_64(src_argb_b)
  free_aligned_buffer_64(dst_argb_c)
  free_aligned_buffer_64(dst_argb_opt)
  return max_diff;
 }
 TEST_F(libyuvTest, ARGBAdd_Any) {
  int max_diff = TestAdd(benchmark_width_ - 1, benchmark_height_,
                         benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 1);
 }
 TEST_F(libyuvTest, ARGBAdd_Unaligned) {
  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
                         benchmark_iterations_, +1, 1);
  EXPECT_LE(max_diff, 1);
 }
 TEST_F(libyuvTest, ARGBAdd_Invert) {
  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
                         benchmark_iterations_, -1, 0);
  EXPECT_LE(max_diff, 1);
 }
 TEST_F(libyuvTest, ARGBAdd_Opt) {
  int max_diff = TestAdd(benchmark_width_, benchmark_height_,
                         benchmark_iterations_, +1, 0);
  EXPECT_LE(max_diff, 1);
 }
 }  // namespace libyuv