From 14f657b44a6fa2e19573684bea7203cd18975a6e Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
 <fbarchard@google.com@16f28f9a-4ce2-e073-06de-1de4eb20be90>
Date: Wed, 27 Feb 2013 06:38:41 +0000
Subject: [PATCH] SSE2 scale for ARGB and posix with specializations BUG=177
 TESTED=try bots Review URL: https://webrtc-codereview.appspot.com/1115008

git-svn-id: http://libyuv.googlecode.com/svn/trunk@585 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |   2 +-
 include/libyuv/row.h     |   3 +-
 include/libyuv/version.h |   2 +-
 source/scale.cc          |  85 +++++++++++++-----
 source/scale_argb.cc     | 190 ++++++++++++++++++++++++++-------------
 5 files changed, 193 insertions(+), 89 deletions(-)

diff --git a/README.chromium b/README.chromium
index 60b8b2da4..d57660be9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 584
+Version: 585
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5bb958698..f051b3def 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -169,7 +169,8 @@ extern "C" {
 #endif
 
 // The following are available on Neon platforms
-#if !defined(LIBYUV_DISABLE_NEON) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#if !defined(LIBYUV_DISABLE_NEON) && \
+    (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
 #define HAS_ABGRTOARGBROW_NEON
 #define HAS_ABGRTOUVROW_NEON
 #define HAS_ABGRTOYROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 01960a700..65ec74f9d 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 584
+#define LIBYUV_VERSION 585
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index 50425ccd5..a146580a9 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -23,9 +23,6 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// Bilinear SSE2 is disabled.
-#define SSE2_DISABLED 1
-
 // Note: Some SSE2 reference manuals
 // cpuvol1.pdf agner_instruction_tables.pdf 253666.pdf 253667.pdf
 
@@ -1954,20 +1951,29 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
 static void ScaleFilterRows_SSE2(uint8* dst_ptr,
                                  const uint8* src_ptr, ptrdiff_t src_stride,
                                  int dst_width, int source_y_fraction) {
+  asm volatile (
   asm volatile (
     "sub       %1,%0                           \n"
+    "shr       %3                              \n"
     "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x80,%3                        \n"
-    "je        3f                              \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
 
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
     "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x1,%%xmm5                     \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
     "punpcklwd %%xmm5,%%xmm5                   \n"
-    "punpckldq %%xmm5,%%xmm5                   \n"
-    "punpcklqdq %%xmm5,%%xmm5                  \n"
+    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+
+    // General purpose row blend.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%1),%%xmm0                     \n"
@@ -1991,25 +1997,56 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr,
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
     ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    ".p2align  4                               \n"
+  "50:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x10,%2                        \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align  4                               \n"
+  "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
     "sub       $0x10,%2                        \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x10,%2                        \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
     "punpckhbw %%xmm0,%%xmm0                   \n"
     "pshufhw   $0xff,%%xmm0,%%xmm0             \n"
     "punpckhqdq %%xmm0,%%xmm0                  \n"
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index d593a54b8..12ae70f03 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -200,8 +200,7 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
 }
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
-#ifndef SSE2_DISABLED
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2
 __declspec(naked) __declspec(align(16))
 void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
                               ptrdiff_t src_stride, int dst_width,
@@ -215,19 +214,24 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
     mov        ecx, [esp + 8 + 16]  // dst_width
     mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
     sub        edi, esi
+    // Dispatch to specialized filters if applicable.
     cmp        eax, 0
-    je         xloop1
+    je         xloop100  // 0 / 256.  Blend 100 / 0.
+    cmp        eax, 64
+    je         xloop75   // 64 / 256 is 0.25.  Blend 75 / 25.
     cmp        eax, 128
-    je         xloop2
+    je         xloop50   // 128 / 256 is 0.50.  Blend 50 / 50.
+    cmp        eax, 192
+    je         xloop25   // 192 / 256 is 0.75.  Blend 25 / 75.
 
     movd       xmm5, eax            // xmm5 = y fraction
     punpcklbw  xmm5, xmm5
+    psrlw      xmm5, 1
     punpcklwd  xmm5, xmm5
-    pshufd     xmm5, xmm5, 0
+    punpckldq  xmm5, xmm5
+    punpcklqdq xmm5, xmm5
     pxor       xmm4, xmm4
 
-    // f * row1 + (1 - frac) row0
-    // frac * (row1 - row0) + row0
     align      16
   xloop:
     movdqa     xmm0, [esi]  // row0
@@ -240,6 +244,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
     punpckhbw  xmm1, xmm4
     psubw      xmm2, xmm0  // row1 - row0
     psubw      xmm3, xmm1
+    paddw      xmm2, xmm2  // 9 bits * 15 bits = 8.16
+    paddw      xmm3, xmm3
     pmulhw     xmm2, xmm5  // scale diff
     pmulhw     xmm3, xmm5
     paddw      xmm0, xmm2  // sum rows
@@ -249,44 +255,63 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
     movdqa     [esi + edi], xmm0
     lea        esi, [esi + 16]
     jg         xloop
+    jmp        xloop99
 
+    // Blend 25 / 75.
+    align      16
+  xloop25:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop25
+    jmp        xloop99
+
+    // Blend 50 / 50.
+    align      16
+  xloop50:
+    movdqa     xmm0, [esi]
+    movdqa     xmm1, [esi + edx]
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop50
+    jmp        xloop99
+
+    // Blend 75 / 25.
+    align      16
+  xloop75:
+    movdqa     xmm1, [esi]
+    movdqa     xmm0, [esi + edx]
+    pavgb      xmm0, xmm1
+    pavgb      xmm0, xmm1
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop75
+    jmp        xloop99
+
+    // Blend 100 / 0 - Copy row unchanged.
+    align      16
+  xloop100:
+    movdqa     xmm0, [esi]
+    sub        ecx, 4
+    movdqa     [esi + edi], xmm0
+    lea        esi, [esi + 16]
+    jg         xloop100
+
+  xloop99:
     shufps     xmm0, xmm0, 0xff
     movdqa     [esi + edi], xmm0    // duplicate last pixel for filtering
     pop        edi
     pop        esi
     ret
-
-    align      16
-  xloop1:
-    movdqa     xmm0, [esi]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop1
-
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
-
-    align      16
-  xloop2:
-    movdqa     xmm0, [esi]
-    pavgb      xmm0, [esi + edx]
-    sub        ecx, 4
-    movdqa     [esi + edi], xmm0
-    lea        esi, [esi + 16]
-    jg         xloop2
-
-    shufps     xmm0, xmm0, 0xff
-    movdqa     [esi + edi], xmm0
-    pop        edi
-    pop        esi
-    ret
   }
 }
-#endif  // SSE2_DISABLED
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
 #define HAS_SCALEARGBFILTERROWS_SSSE3
@@ -544,23 +569,33 @@ static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_argb,
   );
 }
 
-#ifndef SSE2_DISABLED
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version
-#define HAS_SCALEARGBFILTERROWS_SSE2_DISABLED
+#define HAS_SCALEARGBFILTERROWS_SSE2
 void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
                               ptrdiff_t src_stride, int dst_width,
                               int source_y_fraction) {
   asm volatile (
     "sub       %1,%0                           \n"
+    "shr       %3                              \n"
     "cmp       $0x0,%3                         \n"
-    "je        2f                              \n"
-    "cmp       $0x80,%3                        \n"
-    "je        3f                              \n"
+    "je        100f                            \n"
+    "cmp       $0x20,%3                        \n"
+    "je        75f                             \n"
+    "cmp       $0x40,%3                        \n"
+    "je        50f                             \n"
+    "cmp       $0x60,%3                        \n"
+    "je        25f                             \n"
+
+    "movd      %3,%%xmm0                       \n"
+    "neg       %3                              \n"
+    "add       $0x80,%3                        \n"
     "movd      %3,%%xmm5                       \n"
-    "punpcklbw %%xmm5,%%xmm5                   \n"
+    "punpcklbw %%xmm0,%%xmm5                   \n"
     "punpcklwd %%xmm5,%%xmm5                   \n"
     "pshufd    $0x0,%%xmm5,%%xmm5              \n"
     "pxor      %%xmm4,%%xmm4                   \n"
+
+    // General purpose row blend.
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%1),%%xmm0                     \n"
@@ -573,6 +608,8 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
     "punpckhbw %%xmm4,%%xmm1                   \n"
     "psubw     %%xmm0,%%xmm2                   \n"
     "psubw     %%xmm1,%%xmm3                   \n"
+    "paddw     %%xmm2,%%xmm2                   \n"
+    "paddw     %%xmm3,%%xmm3                   \n"
     "pmulhw    %%xmm5,%%xmm2                   \n"
     "pmulhw    %%xmm5,%%xmm3                   \n"
     "paddw     %%xmm2,%%xmm0                   \n"
@@ -582,31 +619,61 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
     "jg        1b                              \n"
-    "jmp       4f                              \n"
+    "jmp       99f                             \n"
+
+    // Blend 25 / 75.
     ".p2align  4                               \n"
-  "2:                                          \n"
+  "25:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        25b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 50 / 50.
+    ".p2align  4                               \n"
+  "50:                                         \n"
+    "movdqa    (%1),%%xmm0                     \n"
+    "movdqa    (%1,%4,1),%%xmm1                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        50b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 75 / 25.
+    ".p2align  4                               \n"
+  "75:                                         \n"
+    "movdqa    (%1),%%xmm1                     \n"
+    "movdqa    (%1,%4,1),%%xmm0                \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "pavgb     %%xmm1,%%xmm0                   \n"
+    "sub       $0x4,%2                         \n"
+    "movdqa    %%xmm0,(%1,%0,1)                \n"
+    "lea       0x10(%1),%1                     \n"
+    "jg        75b                             \n"
+    "jmp       99f                             \n"
+
+    // Blend 100 / 0 - Copy row unchanged.
+    ".p2align  4                               \n"
+  "100:                                        \n"
     "movdqa    (%1),%%xmm0                     \n"
     "sub       $0x4,%2                         \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
     "lea       0x10(%1),%1                     \n"
-    "jg        2b                              \n"
-    "jmp       4f                              \n"
-    ".p2align  4                               \n"
-  "3:                                          \n"
-    "movdqa    (%1),%%xmm0                     \n"
-    "pavgb     (%1,%4,1),%%xmm0                \n"
-    "sub       $0x4,%2                         \n"
-    "movdqa    %%xmm0,(%1,%0,1)                \n"
-    "lea       0x10(%1),%1                     \n"
-    "lea       0x10(%1),%1                     \n"
-    "jg        3b                              \n"
-    ".p2align  4                               \n"
-  "4:                                          \n"
+    "jg        100b                            \n"
+
+  "99:                                         \n"
     "shufps    $0xff,%%xmm0,%%xmm0             \n"
     "movdqa    %%xmm0,(%1,%0,1)                \n"
-  : "+r"(dst_argb),     // %0
-    "+r"(src_argb),     // %1
-    "+r"(dst_width),   // %2
+  : "+r"(dst_argb),   // %0
+    "+r"(src_argb),   // %1
+    "+r"(dst_width),  // %2
     "+r"(source_y_fraction)  // %3
   : "r"(static_cast<intptr_t>(src_stride))  // %4
   : "memory", "cc"
@@ -615,7 +682,6 @@ void ScaleARGBFilterRows_SSE2(uint8* dst_argb, const uint8* src_argb,
 #endif
   );
 }
-#endif  // SSE2_DISABLED
 
 // Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
 #define HAS_SCALEARGBFILTERROWS_SSSE3