AddRow function to help implement box filter

BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/527002 git-svn-id: http://libyuv.googlecode.com/svn/trunk@252 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2012-04-26 00:01:41 +00:00 · 2012-04-26 00:01:41 +00:00 · 5566302866
commit 5566302866
parent 5ff3a8fec5
10 changed files with 219 additions and 2 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 251
+Version: 252
 License: BSD
 License File: LICENSE

--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -170,6 +170,11 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
                    uint8* dst_argb, int dst_stride_argb,
                    int width, int height);

+// Get function to add or subtract a row of bytes to a 16 bit buffer.  For blur.
+typedef void (*AddRow)(const uint8* src, uint16* dst, int width);
+AddRow GetAddRow(uint16* dst, int width);  // Returned function adds src to dst.
+AddRow GetSubRow(uint16* dst, int width);  // Returned function subtracts src.
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/include/libyuv/scale.h
+++ b/include/libyuv/scale.h
@ -66,6 +66,14 @@ int ScaleOffset(const uint8* src, int src_width, int src_height,
                uint8* dst, int dst_width, int dst_height, int dst_yoffset,
                bool interpolate);

+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+                             const uint8* src_argb1,
+                             uint8* dst_argb, int width);  // width: presumably pixels - confirm
+
+// Get function to alpha blend ARGB pixels and store to destination.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
+
+
 // For testing, allow disabling of optimizations.
 void SetUseReferenceImpl(bool use);

--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 251
+#define LIBYUV_VERSION 252

 #endif  // INCLUDE_LIBYUV_VERSION_H_

--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -944,6 +944,32 @@ int ARGBUnattenuate(const uint8* src_argb, int src_stride_argb,
  return 0;
 }

+// Returns the row function that accumulates a row of bytes into a row of
+// 16 bit sums.  Summing rows this way is the building block of a box filter
+// or blur effect.
+AddRow GetAddRow(uint16* dst, int width) {
+#if defined(HAS_ADDROW_SSE2)
+  // The SSE2 version is only chosen when the CPU supports it and both dst
+  // and width are aligned to 16.
+  const bool can_use_sse2 = TestCpuFlag(kCpuHasSSE2) &&
+                            IS_ALIGNED(dst, 16) && IS_ALIGNED(width, 16);
+  if (can_use_sse2) {
+    return AddRow_SSE2;
+  }
+#endif
+  return AddRow_C;
+}
+
+// Returns the row function that removes a row of bytes from a row of 16 bit
+// sums.  With it a caller holding a running sum of rows can drop the oldest
+// row and add a new one without recomputing the full sum.
+AddRow GetSubRow(uint16* dst, int width) {
+#if defined(HAS_ADDROW_SSE2)
+  // The SSE2 version is only chosen when the CPU supports it and both dst
+  // and width are aligned to 16.
+  const bool can_use_sse2 = TestCpuFlag(kCpuHasSSE2) &&
+                            IS_ALIGNED(dst, 16) && IS_ALIGNED(width, 16);
+  if (can_use_sse2) {
+    return SubRow_SSE2;
+  }
+#endif
+  return SubRow_C;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row.h
+++ b/source/row.h
@ -61,6 +61,7 @@ extern "C" {
 #define HAS_I444TOARGBROW_SSSE3
 #define HAS_MIRRORROW_SSSE3
 #define HAS_MIRRORROWUV_SSSE3
+#define HAS_ADDROW_SSE2
 #define HAS_RAWTOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
@ -152,6 +153,11 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
 void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
 void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);

+void AddRow_SSE2(const uint8* src, uint16* dst, int width);  // dst += src
+void SubRow_SSE2(const uint8* src, uint16* dst, int width);  // dst -= src
+void AddRow_C(const uint8* src, uint16* dst, int width);  // C version of add
+void SubRow_C(const uint8* src, uint16* dst, int width);  // C version of sub
+
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
 void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -386,6 +386,30 @@ void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  }
 }

+// Adds each of the width bytes in src to the matching 16 bit sum in dst.
+// A sum that would exceed 65535 is clamped to 65535, so the result is the
+// same as AddRow_SSE2 (which adds with paddusw) whichever version is used.
+// Does nothing when width is zero or negative.
+void AddRow_C(const uint8* src, uint16* dst, int width) {
+  for (int x = 0; x < width; ++x) {
+    // Sum in int so the overflow test itself cannot wrap.
+    const int sum = static_cast<int>(dst[x]) + static_cast<int>(src[x]);
+    dst[x] = static_cast<uint16>(sum > 65535 ? 65535 : sum);
+  }
+}
+
+// Subtracts each of the width bytes in src from the matching 16 bit sum in
+// dst.  A result that would go below zero is clamped to 0, so the result is
+// the same as SubRow_SSE2 (which subtracts with psubusw) whichever version
+// is used.  Does nothing when width is zero or negative.
+void SubRow_C(const uint8* src, uint16* dst, int width) {
+  for (int x = 0; x < width; ++x) {
+    // Subtract in int so a negative result is visible instead of wrapping.
+    const int diff = static_cast<int>(dst[x]) - static_cast<int>(src[x]);
+    dst[x] = static_cast<uint16>(diff < 0 ? 0 : diff);
+  }
+}
+
 void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -1690,6 +1690,68 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 }
 #endif

+#ifdef HAS_ADDROW_SSE2
+// Adds a row of bytes to a row of 16 bit sums.  dst and width aligned to 16
+void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
+  asm volatile (
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for unpack
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm2                     \n"  // read 16 src bytes
+    "lea       0x10(%0),%0                     \n"  // src += 16
+    "movdqa    (%1),%%xmm0                     \n"  // read first 8 words
+    "movdqa    0x10(%1),%%xmm1                 \n"  // read next 8 words
+    "movdqa    %%xmm2,%%xmm3                   \n"  // copy of src bytes
+    "punpcklbw %%xmm4,%%xmm2                   \n"  // low 8 bytes to words
+    "punpckhbw %%xmm4,%%xmm3                   \n"  // high 8 bytes to words
+    "paddusw   %%xmm2,%%xmm0                   \n"  // saturating add
+    "paddusw   %%xmm3,%%xmm1                   \n"
+    "sub       $0x10,%2                        \n"  // width -= 16, sets flags
+    "movdqa    %%xmm0,(%1)                     \n"  // store first 8 words
+    "movdqa    %%xmm1,0x10(%1)                 \n"  // store next 8 words
+    "lea       0x20(%1),%1                     \n"  // dst += 16 words
+    "jg        1b                              \n"  // uses flags from sub
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+
+// Subtracts a row of bytes from 16 bit sums.  dst and width aligned to 16
+void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
+  asm volatile (
+    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = 0 for unpack
+  "1:                                          \n"
+    "movdqu    (%0),%%xmm2                     \n"  // read 16 src bytes
+    "lea       0x10(%0),%0                     \n"  // src += 16
+    "movdqa    (%1),%%xmm0                     \n"  // read first 8 words
+    "movdqa    0x10(%1),%%xmm1                 \n"  // read next 8 words
+    "movdqa    %%xmm2,%%xmm3                   \n"  // copy of src bytes
+    "punpcklbw %%xmm4,%%xmm2                   \n"  // low 8 bytes to words
+    "punpckhbw %%xmm4,%%xmm3                   \n"  // high 8 bytes to words
+    "psubusw   %%xmm2,%%xmm0                   \n"  // saturating subtract
+    "psubusw   %%xmm3,%%xmm1                   \n"
+    "sub       $0x10,%2                        \n"  // width -= 16, sets flags
+    "movdqa    %%xmm0,(%1)                     \n"  // store first 8 words
+    "movdqa    %%xmm1,0x10(%1)                 \n"  // store next 8 words
+    "lea       0x20(%1),%1                     \n"  // dst += 16 words
+    "jg        1b                              \n"  // uses flags from sub
+  : "+r"(src),   // %0
+    "+r"(dst),   // %1
+    "+r"(width)  // %2
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
+#endif
+  );
+}
+#endif  // HAS_ADDROW_SSE2
+
 #ifdef HAS_SPLITUV_SSE2
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -1716,6 +1716,65 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 }
 #endif

+#ifdef HAS_ADDROW_SSE2
+// Adds a row of bytes to a row of 16 bit sums.  dst and width aligned to 16
+__declspec(naked) __declspec(align(16))
+void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
+__asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pxor       xmm4, xmm4       // zero, used to widen bytes to words
+
+    align      16
+ convertloop:
+    movdqu     xmm2, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqa     xmm0, [edx]       // read first 8 words
+    movdqa     xmm1, [edx + 16]  // read next 8 words
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4        // low 8 bytes to words
+    punpckhbw  xmm3, xmm4        // high 8 bytes to words
+    paddusw    xmm0, xmm2        // saturating add of first 8 words
+    paddusw    xmm1, xmm3        // saturating add of next 8 words
+    sub        ecx, 16           // sets the flags tested by jg below
+    movdqa     [edx], xmm0       // store first 8 words
+    movdqa     [edx + 16], xmm1  // store next 8 words
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+
+__declspec(naked) __declspec(align(16))  // dst and width aligned to 16
+void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
+__asm {
+    mov        eax, [esp + 4]   // src
+    mov        edx, [esp + 8]   // dst
+    mov        ecx, [esp + 12]  // width
+    pxor       xmm4, xmm4       // zero, used to widen bytes to words
+
+    align      16
+ convertloop:
+    movdqu     xmm2, [eax]       // read 16 bytes
+    lea        eax, [eax + 16]
+    movdqa     xmm0, [edx]       // read first 8 words
+    movdqa     xmm1, [edx + 16]  // read next 8 words
+    movdqa     xmm3, xmm2
+    punpcklbw  xmm2, xmm4        // low 8 bytes to words
+    punpckhbw  xmm3, xmm4        // high 8 bytes to words
+    psubusw    xmm0, xmm2        // saturating sub of first 8 words
+    psubusw    xmm1, xmm3        // saturating sub of next 8 words
+    sub        ecx, 16           // sets the flags tested by jg below
+    movdqa     [edx], xmm0       // store first 8 words
+    movdqa     [edx + 16], xmm1  // store next 8 words
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ADDROW_SSE2
+
 #ifdef HAS_SPLITUV_SSE2
 __declspec(naked) __declspec(align(16))
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -193,4 +193,31 @@ TEST_F(libyuvTest, TestAttenuate) {
  EXPECT_EQ(85,  atten_pixels[255][2]);
  EXPECT_EQ(255, atten_pixels[255][3]);
 }
+
+// Adds a ramp of bytes to a zeroed row of 16 bit sums twice, removes it once,
+// and checks two sample columns after each step.  Afterwards AddRow is called
+// many more times; those results are not checked.
+TEST_F(libyuvTest, TestAddRow) {
+  const int kWidth = 256;
+  SIMD_ALIGNED(uint8 orig_pixels[kWidth]);
+  SIMD_ALIGNED(uint16 added_pixels[kWidth]);
+
+  libyuv::AddRow add_row = GetAddRow(added_pixels, kWidth);
+  libyuv::AddRow sub_row = GetSubRow(added_pixels, kWidth);
+
+  // Sums start at zero; the source row is the ramp 0..255.
+  memset(added_pixels, 0, sizeof(added_pixels));
+  for (int i = 0; i < kWidth; ++i) {
+    orig_pixels[i] = i;
+  }
+
+  // One row added.
+  add_row(orig_pixels, added_pixels, kWidth);
+  EXPECT_EQ(7u, added_pixels[7]);
+  EXPECT_EQ(250u, added_pixels[250]);
+
+  // Two rows added.
+  add_row(orig_pixels, added_pixels, kWidth);
+  EXPECT_EQ(14u, added_pixels[7]);
+  EXPECT_EQ(500u, added_pixels[250]);
+
+  // One row removed again.
+  sub_row(orig_pixels, added_pixels, kWidth);
+  EXPECT_EQ(7u, added_pixels[7]);
+  EXPECT_EQ(250u, added_pixels[250]);
+
+  const int kRepeats = 1000 * (1280 * 720 * 4 / 256);
+  for (int i = 0; i < kRepeats; ++i) {
+    add_row(orig_pixels, added_pixels, kWidth);
+  }
+}
 }