Reenable AVX2 scaling with bug fix for any width

BUG=376
TESTED=unittest on scale functions
R=brucedawson@google.com, harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30759004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1135 16f28f9a-4ce2-e073-06de-1de4eb20be90
2025-12-06 16:56:55 +08:00 · 2014-10-22 01:15:20 +00:00 · 2014-10-22 01:15:20 +00:00 · af6f25245e
commit af6f25245e
parent 4165437c3e
5 changed files with 51 additions and 53 deletions
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1134
+Version: 1135
 License: BSD
 License File: LICENSE
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -201,8 +201,7 @@ extern "C" {
 #define HAS_ARGBTOYJROW_AVX2
 #define HAS_ARGBTOYROW_AVX2
 #define HAS_I422TOARGBROW_AVX2
-// TODO(fbarchard): fix bug #376.
+#define HAS_INTERPOLATEROW_AVX2
-// #define HAS_INTERPOLATEROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1134
+#define LIBYUV_VERSION 1135
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -581,24 +581,20 @@ YANY(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON,
    }
 #ifdef HAS_INTERPOLATEROW_AVX2
-NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2,
+NANY(InterpolateRow_Any_AVX2, InterpolateRow_AVX2, InterpolateRow_C, 1, 1, 31)
-     InterpolateRow_C, 1, 1, 32)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSSE3
-NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3,
+NANY(InterpolateRow_Any_SSSE3, InterpolateRow_SSSE3, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_SSE2
-NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2,
+NANY(InterpolateRow_Any_SSE2, InterpolateRow_SSE2, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_NEON
-NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON,
+NANY(InterpolateRow_Any_NEON, InterpolateRow_NEON, InterpolateRow_C, 1, 1, 15)
-     InterpolateRow_C, 1, 1, 15)
 #endif
 #ifdef HAS_INTERPOLATEROW_MIPS_DSPR2
-NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2,
+NANY(InterpolateRow_Any_MIPS_DSPR2, InterpolateRow_MIPS_DSPR2, InterpolateRow_C,
-     InterpolateRow_C, 1, 1, 3)
+     1, 1, 3)
 #endif
 #undef NANY
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -4972,11 +4972,11 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
 #endif  // HAS_ARGBAFFINEROW_SSE2
 #ifdef HAS_INTERPOLATEROW_AVX2
-// Bilinear filter 16x2 -> 16x1
+// Bilinear filter 32x2 -> 32x1
 __declspec(naked) __declspec(align(16))
 void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
-                          ptrdiff_t src_stride, int dst_width,
+                         ptrdiff_t src_stride, int dst_width,
-                          int source_y_fraction) {
+                         int source_y_fraction) {
  __asm {
    push       esi
    push       edi
@@ -5023,45 +5023,48 @@ void InterpolateRow_AVX2(uint8* dst_ptr, const uint8* src_ptr,
    jg         xloop
    jmp        xloop99
-    // Blend 25 / 75.
+   // Blend 25 / 75.
-    align      4
+   align      4
-  xloop25:
+ xloop25:
-    vmovdqu    ymm0, [esi]
+   vmovdqu    ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    ymm1, [esi + edx]
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vpavgb     ymm0, ymm0, ymm1
-    sub        ecx, 32
+   vpavgb     ymm0, ymm0, ymm1
-    vmovdqu    [esi + edi], ymm0
+   sub        ecx, 32
-    lea        esi, [esi + 32]
+   vmovdqu    [esi + edi], ymm0
-    jg         xloop25
+   lea        esi, [esi + 32]
-    jmp        xloop99
+   jg         xloop25
+   jmp        xloop99
-    // Blend 50 / 50.
+   // Blend 50 / 50.
-    align      4
+   align      4
-  xloop50:
+ xloop50:
-    vmovdqu    ymm0, [esi]
+   vmovdqu    ymm0, [esi]
-    vpavgb     ymm0, ymm0, [esi + edx]
+   vmovdqu    ymm1, [esi + edx]
-    sub        ecx, 32
+   vpavgb     ymm0, ymm0, ymm1
-    vmovdqu    [esi + edi], ymm0
+   sub        ecx, 32
-    lea        esi, [esi + 32]
+   vmovdqu    [esi + edi], ymm0
-    jg         xloop50
+   lea        esi, [esi + 32]
-    jmp        xloop99
+   jg         xloop50
+   jmp        xloop99
-    // Blend 75 / 25.
+   // Blend 75 / 25.
-    align      4
+   align      4
-  xloop75:
+ xloop75:
-    vmovdqu    ymm0, [esi + edx]
+   vmovdqu    ymm1, [esi]
-    vpavgb     ymm0, ymm0, [esi]
+   vmovdqu    ymm0, [esi + edx]
-    vpavgb     ymm0, ymm0, [esi]
+   vpavgb     ymm0, ymm0, ymm1
-    sub        ecx, 32
+   vpavgb     ymm0, ymm0, ymm1
-    vmovdqu     [esi + edi], ymm0
+   sub        ecx, 32
-    lea        esi, [esi + 32]
+   vmovdqu    [esi + edi], ymm0
-    jg         xloop75
+   lea        esi, [esi + 32]
-    jmp        xloop99
+   jg         xloop75
+   jmp        xloop99
-    // Blend 100 / 0 - Copy row unchanged.
+   // Blend 100 / 0 - Copy row unchanged.
-    align      4
+   align      4
-  xloop100:
+ xloop100:
-    rep movsb
+   rep movsb
  xloop99:
    pop        edi