FixedDiv1 using a single 64/32 divide. Removes size restriction from slope.

BUG=302 TESTED=libyuv scale tests R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/6489004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@940 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-01-01 03:12:16 +08:00 · 2014-01-02 22:32:09 +00:00 · 2014-01-02 22:32:09 +00:00 · 5dba58cb1e
commit 5dba58cb1e
parent 277378723a
12 changed files with 139 additions and 61 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 939
+Version: 941
 License: BSD
 License File: LICENSE

--- a/include/libyuv.h
+++ b/include/libyuv.h
@ -26,6 +26,7 @@
 #include "libyuv/row.h"
 #include "libyuv/scale.h"
 #include "libyuv/scale_argb.h"
+#include "libyuv/scale_row.h"
 #include "libyuv/version.h"
 #include "libyuv/video_common.h"

--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -104,7 +104,6 @@ extern "C" {
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
 #define HAS_COPYROW_X86
-#define HAS_FIXEDDIV_X86
 #define HAS_HALFROW_SSE2
 #define HAS_I400TOARGBROW_SSE2
 #define HAS_I411TOARGBROW_SSSE3
@ -1684,15 +1683,6 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width, const uint8* luma,
                                 const uint32 lumacoeff);

-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_C(int num, int div);
-int FixedDiv_X86(int num, int div);
-#ifdef HAS_FIXEDDIV_X86
-#define FixedDiv FixedDiv_X86
-#else
-#define FixedDiv FixedDiv_C
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/include/libyuv/scale_row.h
+++ b/include/libyuv/scale_row.h
@ -33,6 +33,8 @@ extern "C" {
 #define HAS_SCALEARGBCOLS_SSE2
 #define HAS_SCALEARGBFILTERCOLS_SSSE3
 #define HAS_SCALEARGBCOLSUP2_SSE2
+#define HAS_FIXEDDIV_X86
+#define HAS_FIXEDDIV1_X86
 #endif

 // The following are available on Neon platforms:
@ -61,17 +63,31 @@ void ScalePlaneVertical(int src_height,
                        int src_stride, int dst_stride,
                        const uint8* src_argb, uint8* dst_argb,
                        int x, int y, int dy,
-                        int bpp, FilterMode filtering);
+                        int bpp, enum FilterMode filtering);

 // Simplify the filtering based on scale factors.
-FilterMode ScaleFilterReduce(int src_width, int src_height,
-                             int dst_width, int dst_height,
-                             FilterMode filtering);
+enum FilterMode ScaleFilterReduce(int src_width, int src_height,
+                                  int dst_width, int dst_height,
+                                  enum FilterMode filtering);
+
+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div);
+int FixedDiv_X86(int num, int div);
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div);
+int FixedDiv1_X86(int num, int div);
+#ifdef HAS_FIXEDDIV_X86
+#define FixedDiv FixedDiv_X86
+#define FixedDiv1 FixedDiv1_X86
+#else
+#define FixedDiv FixedDiv_C
+#define FixedDiv1 FixedDiv1_C
+#endif

 // Compute slope values for stepping.
 void ScaleSlope(int src_width, int src_height,
                int dst_width, int dst_height,
-                FilterMode filtering,
+                enum FilterMode filtering,
                int* x, int* y, int* dx, int* dy);

 void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_

-#define LIBYUV_VERSION 939
+#define LIBYUV_VERSION 941

 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -59,11 +59,6 @@ static __inline uint32 Abs(int32 v) {
 }
 #endif  // USE_BRANCHLESS

-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_C(int num, int div) {
-  return static_cast<int>((static_cast<int64>(num) << 16) / div);
-}
-
 #ifdef LIBYUV_LITTLE_ENDIAN
 #define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v
 #else
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -6170,23 +6170,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
 }
 #endif  // HAS_I422TOUYVYROW_SSE2

-#ifdef HAS_FIXEDDIV_X86
-// Divide num by div and return as 16.16 fixed point result.
-int FixedDiv_X86(int num, int div) {
-  asm volatile (
-    "cdq                                       \n"
-    "shld      $0x10,%%eax,%%edx               \n"
-    "shl       $0x10,%%eax                     \n"
-    "idiv      %1                              \n"
-    "mov       %0, %%eax                       \n"
-    : "+a"(num)  // %0
-    : "c"(div)   // %1
-    : "memory", "cc", "edx"
-  );
-  return num;
-}
-#endif  // HAS_FIXEDDIV_X86
-
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -7009,21 +7009,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
  }
 }

-#ifdef HAS_FIXEDDIV_X86
-// Divide num by div and return as 16.16 fixed point result.
-__declspec(naked) __declspec(align(16))
-int FixedDiv_X86(int num, int div) {
-  __asm {
-    mov        eax, [esp + 4]    // num
-    cdq                          // extend num to 64 bits
-    shld       edx, eax, 16      // 32.16
-    shl        eax, 16
-    idiv       dword ptr [esp + 8]
-    ret
-  }
-}
-#endif  // HAS_FIXEDDIV_X86
-
 #ifdef HAS_ARGBPOLYNOMIALROW_SSE2
 __declspec(naked) __declspec(align(16))
 void ARGBPolynomialRow_SSE2(const uint8* src_argb,
--- a/source/scale_common.cc
+++ b/source/scale_common.cc
@ -584,9 +584,18 @@ FilterMode ScaleFilterReduce(int src_width, int src_height,
  return filtering;
 }

+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_C(int num, int div) {
+  return static_cast<int>((static_cast<int64>(num) << 16) / div);
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_C(int num, int div) {
+  return static_cast<int>(((static_cast<int64>(num) << 16) - 0x00010001) /
+                          (div - 1));
+}
+
 #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s)
-#define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \
-                                     (dst << 16) - 0x00010000);

 // Compute slope values for stepping.
 void ScaleSlope(int src_width, int src_height,
@ -613,14 +622,14 @@ void ScaleSlope(int src_width, int src_height,
      *dx = FixedDiv(Abs(src_width), dst_width);
      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
    } else if (dst_width > 1) {
-      *dx = FIXEDDIV1(Abs(src_width), dst_width);
+      *dx = FixedDiv1(Abs(src_width), dst_width);
      *x = 0;
    }
    if (dst_height <= src_height) {
      *dy = FixedDiv(src_height,  dst_height);
      *y = CENTERSTART(*dy, -32768);  // Subtract 0.5 (32768) to center filter.
    } else if (dst_height > 1) {
-      *dy = FIXEDDIV1(src_height, dst_height);
+      *dy = FixedDiv1(src_height, dst_height);
      *y = 0;
    }
  } else if (filtering == kFilterLinear) {
@ -629,7 +638,7 @@ void ScaleSlope(int src_width, int src_height,
      *dx = FixedDiv(Abs(src_width), dst_width);
      *x = CENTERSTART(*dx, -32768);  // Subtract 0.5 (32768) to center filter.
    } else if (dst_width > 1) {
-      *dx = FIXEDDIV1(Abs(src_width), dst_width);
+      *dx = FixedDiv1(Abs(src_width), dst_width);
      *x = 0;
    }
    *dy = FixedDiv(src_height, dst_height);
@ -649,7 +658,6 @@ void ScaleSlope(int src_width, int src_height,
  }
 }
 #undef CENTERSTART
-#undef FIXEDDIV1

 #ifdef __cplusplus
 }  // extern "C"
--- a/source/scale_posix.cc
+++ b/source/scale_posix.cc
@ -1274,6 +1274,39 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb,
  );
 }

+// Divide num by div and return as 16.16 fixed point result.
+int FixedDiv_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+int FixedDiv1_X86(int num, int div) {
+  asm volatile (
+    "cdq                                       \n"
+    "shld      $0x10,%%eax,%%edx               \n"
+    "shl       $0x10,%%eax                     \n"
+    "sub       $0x10001,%%eax                  \n"
+    "sbb       $0x0,%%edx                      \n"
+    "sub       $0x1,%1                         \n"
+    "idiv      %1                              \n"
+    "mov       %0, %%eax                       \n"
+    : "+a"(num)  // %0
+    : "c"(div)   // %1
+    : "memory", "cc", "edx"
+  );
+  return num;
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus
--- a/source/scale_win.cc
+++ b/source/scale_win.cc
@ -1281,6 +1281,36 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb,
  }
 }

+// Divide num by div and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    idiv       dword ptr [esp + 8]
+    ret
+  }
+}
+
+// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
+__declspec(naked) __declspec(align(16))
+int FixedDiv1_X86(int num, int div) {
+  __asm {
+    mov        eax, [esp + 4]    // num
+    mov        ecx, [esp + 8]    // denom
+    cdq                          // extend num to 64 bits
+    shld       edx, eax, 16      // 32.16
+    shl        eax, 16
+    sub        eax, 0x00010001
+    sbb        edx, 0
+    sub        ecx, 1
+    idiv       ecx
+    ret
+  }
+}
+
 #endif  // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

 #ifdef __cplusplus
--- a/unit_test/math_test.cc
+++ b/unit_test/math_test.cc
@ -14,6 +14,8 @@
 #include "libyuv/basic_types.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/row.h"
+#include "libyuv/scale.h"
+#include "libyuv/scale_row.h"
 #include "../unit_test/unit_test.h"

 namespace libyuv {
@ -27,7 +29,7 @@ TEST_F(libyuvTest, TestFixedDiv) {
  EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1));
  EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1));
  // TODO(fbarchard): Avoid the following that throw exceptions.
-  // EXPECT_EQ(0x10000, libyuv::FixedDiv(0x10000, 1));
+  // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1));
  // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1));

  EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640));
@ -118,4 +120,39 @@ TEST_F(libyuvTest, TestFixedDiv_Opt) {
  }
 }

+TEST_F(libyuvTest, TestFixedDiv1_Opt) {
+  int num[1280];
+  int div[1280];
+  int result_opt[1280];
+  int result_c[1280];
+
+  srandom(time(NULL));
+  MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num));
+  MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div));
+  for (int j = 0; j < 1280; ++j) {
+    num[j] &= 4095;  // Make numerator smaller.
+    div[j] &= 4095;  // Make divisor smaller.
+    if (div[j] <= 1) {
+      div[j] = 1280;
+    }
+  }
+
+  int has_x86 = TestCpuFlag(kCpuHasX86);
+  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
+    if (has_x86) {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv1(num[j], div[j]);
+      }
+    } else {
+      for (int j = 0; j < 1280; ++j) {
+        result_opt[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+      }
+    }
+  }
+  for (int j = 0; j < 1280; ++j) {
+    result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]);
+    EXPECT_NEAR(result_c[j], result_opt[j], 1);
+  }
+}
+
 }  // namespace libyuv