From 5dba58cb1ed4117f491267f68351a6079eaed667 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Thu, 2 Jan 2014 22:32:09 +0000 Subject: [PATCH] FixedDiv1 using a single 64/32 divide. Removes size restriction from slope. BUG=302 TESTED=libyuv scale tests R=tpsiaki@google.com Review URL: https://webrtc-codereview.appspot.com/6489004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@940 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv.h | 1 + include/libyuv/row.h | 10 ---------- include/libyuv/scale_row.h | 26 ++++++++++++++++++++----- include/libyuv/version.h | 2 +- source/row_common.cc | 5 ----- source/row_posix.cc | 17 ----------------- source/row_win.cc | 15 --------------- source/scale_common.cc | 20 +++++++++++++------ source/scale_posix.cc | 33 ++++++++++++++++++++++++++++++++ source/scale_win.cc | 30 +++++++++++++++++++++++++++++ unit_test/math_test.cc | 39 +++++++++++++++++++++++++++++++++++++- 12 files changed, 139 insertions(+), 61 deletions(-) diff --git a/README.chromium b/README.chromium index 77e29fbf6..d677519fb 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 939 +Version: 941 License: BSD License File: LICENSE diff --git a/include/libyuv.h b/include/libyuv.h index c058665a8..3bebe642c 100644 --- a/include/libyuv.h +++ b/include/libyuv.h @@ -26,6 +26,7 @@ #include "libyuv/row.h" #include "libyuv/scale.h" #include "libyuv/scale_argb.h" +#include "libyuv/scale_row.h" #include "libyuv/version.h" #include "libyuv/video_common.h" diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fdb459f29..75e1dca0e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -104,7 +104,6 @@ extern "C" { #define HAS_COPYROW_ERMS #define HAS_COPYROW_SSE2 #define HAS_COPYROW_X86 -#define HAS_FIXEDDIV_X86 #define HAS_HALFROW_SSE2 #define HAS_I400TOARGBROW_SSE2 #define HAS_I411TOARGBROW_SSSE3 @@ -1684,15 +1683,6 @@ void 
ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width, const uint8* luma, const uint32 lumacoeff); -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div); -int FixedDiv_X86(int num, int div); -#ifdef HAS_FIXEDDIV_X86 -#define FixedDiv FixedDiv_X86 -#else -#define FixedDiv FixedDiv_C -#endif - #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/include/libyuv/scale_row.h b/include/libyuv/scale_row.h index 8e1abfa1c..fddb15215 100644 --- a/include/libyuv/scale_row.h +++ b/include/libyuv/scale_row.h @@ -33,6 +33,8 @@ extern "C" { #define HAS_SCALEARGBCOLS_SSE2 #define HAS_SCALEARGBFILTERCOLS_SSSE3 #define HAS_SCALEARGBCOLSUP2_SSE2 +#define HAS_FIXEDDIV_X86 +#define HAS_FIXEDDIV1_X86 #endif // The following are available on Neon platforms: @@ -61,17 +63,31 @@ void ScalePlaneVertical(int src_height, int src_stride, int dst_stride, const uint8* src_argb, uint8* dst_argb, int x, int y, int dy, - int bpp, FilterMode filtering); + int bpp, enum FilterMode filtering); // Simplify the filtering based on scale factors. -FilterMode ScaleFilterReduce(int src_width, int src_height, - int dst_width, int dst_height, - FilterMode filtering); +enum FilterMode ScaleFilterReduce(int src_width, int src_height, + int dst_width, int dst_height, + enum FilterMode filtering); + +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div); +int FixedDiv_X86(int num, int div); +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div); +int FixedDiv1_X86(int num, int div); +#ifdef HAS_FIXEDDIV_X86 +#define FixedDiv FixedDiv_X86 +#define FixedDiv1 FixedDiv1_X86 +#else +#define FixedDiv FixedDiv_C +#define FixedDiv1 FixedDiv1_C +#endif // Compute slope values for stepping. 
void ScaleSlope(int src_width, int src_height, int dst_width, int dst_height, - FilterMode filtering, + enum FilterMode filtering, int* x, int* y, int* dx, int* dy); void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 199910d79..7a2eabfd9 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 939 +#define LIBYUV_VERSION 941 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_common.cc b/source/row_common.cc index 98ecdf41e..204763281 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -59,11 +59,6 @@ static __inline uint32 Abs(int32 v) { } #endif // USE_BRANCHLESS -// Divide num by div and return as 16.16 fixed point result. -int FixedDiv_C(int num, int div) { - return static_cast<int>((static_cast<int64>(num) << 16) / div); -} - #ifdef LIBYUV_LITTLE_ENDIAN #define WRITEWORD(p, v) *reinterpret_cast<uint32*>(p) = v #else diff --git a/source/row_posix.cc b/source/row_posix.cc index a0106b344..4d773b334 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -6170,23 +6170,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } #endif // HAS_I422TOUYVYROW_SSE2 -#ifdef HAS_FIXEDDIV_X86 -// Divide num by div and return as 16.16 fixed point result. 
-int FixedDiv_X86(int num, int div) { - asm volatile ( - "cdq \n" - "shld $0x10,%%eax,%%edx \n" - "shl $0x10,%%eax \n" - "idiv %1 \n" - "mov %0, %%eax \n" - : "+a"(num) // %0 - : "c"(div) // %1 - : "memory", "cc", "edx" - ); - return num; -} -#endif // HAS_FIXEDDIV_X86 - #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 void ARGBPolynomialRow_SSE2(const uint8* src_argb, uint8* dst_argb, const float* poly, diff --git a/source/row_win.cc b/source/row_win.cc index 502d25cea..62d819f54 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -7009,21 +7009,6 @@ void I422ToUYVYRow_SSE2(const uint8* src_y, } } -#ifdef HAS_FIXEDDIV_X86 -// Divide num by div and return as 16.16 fixed point result. -__declspec(naked) __declspec(align(16)) -int FixedDiv_X86(int num, int div) { - __asm { - mov eax, [esp + 4] // num - cdq // extend num to 64 bits - shld edx, eax, 16 // 32.16 - shl eax, 16 - idiv dword ptr [esp + 8] - ret - } -} -#endif // HAS_FIXEDDIV_X86 - #ifdef HAS_ARGBPOLYNOMIALROW_SSE2 __declspec(naked) __declspec(align(16)) void ARGBPolynomialRow_SSE2(const uint8* src_argb, diff --git a/source/scale_common.cc b/source/scale_common.cc index ee6a33629..5d0fcb8b9 100644 --- a/source/scale_common.cc +++ b/source/scale_common.cc @@ -584,9 +584,18 @@ FilterMode ScaleFilterReduce(int src_width, int src_height, return filtering; } +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_C(int num, int div) { + return static_cast<int>((static_cast<int64>(num) << 16) / div); +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +int FixedDiv1_C(int num, int div) { + return static_cast<int>(((static_cast<int64>(num) << 16) - 0x00010001) / + (div - 1)); +} + #define CENTERSTART(dx, s) (dx < 0) ? -((-dx >> 1) + s) : ((dx >> 1) + s) -#define FIXEDDIV1(src, dst) FixedDiv((src << 16) - 0x00010001, \ - (dst << 16) - 0x00010000); // Compute slope values for stepping. 
void ScaleSlope(int src_width, int src_height, @@ -613,14 +622,14 @@ void ScaleSlope(int src_width, int src_height, *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_width > 1) { - *dx = FIXEDDIV1(Abs(src_width), dst_width); + *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } if (dst_height <= src_height) { *dy = FixedDiv(src_height, dst_height); *y = CENTERSTART(*dy, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_height > 1) { - *dy = FIXEDDIV1(src_height, dst_height); + *dy = FixedDiv1(src_height, dst_height); *y = 0; } } else if (filtering == kFilterLinear) { @@ -629,7 +638,7 @@ void ScaleSlope(int src_width, int src_height, *dx = FixedDiv(Abs(src_width), dst_width); *x = CENTERSTART(*dx, -32768); // Subtract 0.5 (32768) to center filter. } else if (dst_width > 1) { - *dx = FIXEDDIV1(Abs(src_width), dst_width); + *dx = FixedDiv1(Abs(src_width), dst_width); *x = 0; } *dy = FixedDiv(src_height, dst_height); @@ -649,7 +658,6 @@ void ScaleSlope(int src_width, int src_height, } } #undef CENTERSTART -#undef FIXEDDIV1 #ifdef __cplusplus } // extern "C" diff --git a/source/scale_posix.cc b/source/scale_posix.cc index e32268fbe..4a8b729c5 100644 --- a/source/scale_posix.cc +++ b/source/scale_posix.cc @@ -1274,6 +1274,39 @@ void ScaleARGBFilterCols_SSSE3(uint8* dst_argb, const uint8* src_argb, ); } +// Divide num by div and return as 16.16 fixed point result. +int FixedDiv_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. 
+int FixedDiv1_X86(int num, int div) { + asm volatile ( + "cdq \n" + "shld $0x10,%%eax,%%edx \n" + "shl $0x10,%%eax \n" + "sub $0x10001,%%eax \n" + "sbb $0x0,%%edx \n" + "sub $0x1,%1 \n" + "idiv %1 \n" + "mov %0, %%eax \n" + : "+a"(num) // %0 + : "c"(div) // %1 + : "memory", "cc", "edx" + ); + return num; +} + #endif // defined(__x86_64__) || defined(__i386__) #ifdef __cplusplus diff --git a/source/scale_win.cc b/source/scale_win.cc index 76f5f4b4b..35e1af901 100644 --- a/source/scale_win.cc +++ b/source/scale_win.cc @@ -1281,6 +1281,36 @@ void ScaleARGBColsUp2_SSE2(uint8* dst_argb, const uint8* src_argb, } } +// Divide num by div and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + idiv dword ptr [esp + 8] + ret + } +} + +// Divide num - 1 by div - 1 and return as 16.16 fixed point result. +__declspec(naked) __declspec(align(16)) +int FixedDiv1_X86(int num, int div) { + __asm { + mov eax, [esp + 4] // num + mov ecx, [esp + 8] // denom + cdq // extend num to 64 bits + shld edx, eax, 16 // 32.16 + shl eax, 16 + sub eax, 0x00010001 + sbb edx, 0 + sub ecx, 1 + idiv ecx + ret + } +} + #endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER) #ifdef __cplusplus diff --git a/unit_test/math_test.cc b/unit_test/math_test.cc index ac6adec9e..370701037 100644 --- a/unit_test/math_test.cc +++ b/unit_test/math_test.cc @@ -14,6 +14,8 @@ #include "libyuv/basic_types.h" #include "libyuv/cpu_id.h" #include "libyuv/row.h" +#include "libyuv/scale.h" +#include "libyuv/scale_row.h" #include "../unit_test/unit_test.h" namespace libyuv { @@ -27,7 +29,7 @@ TEST_F(libyuvTest, TestFixedDiv) { EXPECT_EQ(0x10000, libyuv::FixedDiv(1, 1)); EXPECT_EQ(0x7fff0000, libyuv::FixedDiv(0x7fff, 1)); // TODO(fbarchard): Avoid the following that throw exceptions. 
- // EXPECT_EQ(0x10000, libyuv::FixedDiv(0x10000, 1)); + // EXPECT_EQ(0x100000000, libyuv::FixedDiv(0x10000, 1)); // EXPECT_EQ(0x80000000, libyuv::FixedDiv(0x8000, 1)); EXPECT_EQ(0x20000, libyuv::FixedDiv(640 * 2, 640)); @@ -118,4 +120,39 @@ TEST_F(libyuvTest, TestFixedDiv_Opt) { } } +TEST_F(libyuvTest, TestFixedDiv1_Opt) { + int num[1280]; + int div[1280]; + int result_opt[1280]; + int result_c[1280]; + + srandom(time(NULL)); + MemRandomize(reinterpret_cast<uint8*>(&num[0]), sizeof(num)); + MemRandomize(reinterpret_cast<uint8*>(&div[0]), sizeof(div)); + for (int j = 0; j < 1280; ++j) { + num[j] &= 4095; // Make numerator smaller. + div[j] &= 4095; // Make divisor smaller. + if (div[j] <= 1) { + div[j] = 1280; + } + } + + int has_x86 = TestCpuFlag(kCpuHasX86); + for (int i = 0; i < benchmark_pixels_div1280_; ++i) { + if (has_x86) { + for (int j = 0; j < 1280; ++j) { + result_opt[j] = libyuv::FixedDiv1(num[j], div[j]); + } + } else { + for (int j = 0; j < 1280; ++j) { + result_opt[j] = libyuv::FixedDiv1_C(num[j], div[j]); + } + } + } + for (int j = 0; j < 1280; ++j) { + result_c[j] = libyuv::FixedDiv1_C(num[j], div[j]); + EXPECT_NEAR(result_c[j], result_opt[j], 1); + } +} + } // namespace libyuv