From 1d160cb99f2b05df80c4555bd769825ad1175dc9 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Wed, 28 Nov 2012 20:02:55 +0000
Subject: [PATCH] Attenuate ARGB pixels NEON optimized

BUG=164
TEST=./libyuv_unittest --gtest_filter=*Atten*
Review URL: https://webrtc-codereview.appspot.com/937031

git-svn-id: http://libyuv.googlecode.com/svn/trunk@506 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            | 2 +-
 include/libyuv/cpu_id.h    | 2 +-
 include/libyuv/row.h       | 16 +++++-
 include/libyuv/version.h   | 2 +-
 source/convert.cc          | 29 ++++------
 source/convert_argb.cc     | 16 ++++--
 source/convert_from.cc     | 45 ++++++---------
 source/cpu_id.cc           | 3 +-
 source/planar_functions.cc | 34 ++++++++----
 source/rotate_neon.cc      | 6 +-
 source/row_any.cc          | 26 +++++++++
 source/row_neon.cc         | 55 ++++++++++++++++++
 source/row_posix.cc        | 5 +-
 source/row_win.cc          | 5 +-
 source/scale.cc            | 9 ++-
 source/scale_argb.cc       | 23 +++++---
 unit_test/convert_test.cc  | 14 +++-
 unit_test/planar_test.cc   | 111 ++++++++++++++++++++++++++++---------
 18 files changed, 280 insertions(+), 123 deletions(-)

diff --git a/README.chromium b/README.chromium
index 78e01936c..ea7b076b9 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 505
+Version: 506
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/cpu_id.h b/include/libyuv/cpu_id.h
index 7727f2760..0c50886cf 100644
--- a/include/libyuv/cpu_id.h
+++ b/include/libyuv/cpu_id.h
@@ -53,7 +53,7 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == 1 ? InitCpuFlags() : cpu_info_) & test_flag;
+  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 // For testing, allow CPU flags to be disabled.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7906d0153..2435138ec 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -139,7 +139,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
 #define HAS_ARGBBLENDROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #endif
@@ -221,6 +221,7 @@ extern "C" {
 // Effects
 #define HAS_ARGBINTERPOLATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
 #endif
 // The following are available on Mips platforms
@@ -935,6 +936,12 @@ void YToARGBRow_SSE2(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
 void YToARGBRow_NEON(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
 // ARGB preattenuated alpha blend.
void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1, @@ -1194,6 +1201,13 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y, void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width); void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width); +void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb, + int width); +void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb, + int width); // Inverse table for unattenuate, shared by C and SSE2. extern uint32 fixed_invtbl8[256]; diff --git a/include/libyuv/version.h b/include/libyuv/version.h index af349e63e..90da7968c 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 505 +#define LIBYUV_VERSION 506 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert.cc b/source/convert.cc index a35cd1769..11d32cd77 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -18,6 +18,7 @@ #endif #include "libyuv/planar_functions.h" #include "libyuv/rotate.h" +#include "libyuv/scale.h" // For ScalePlane() #include "libyuv/video_common.h" #include "libyuv/row.h" @@ -215,12 +216,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y, return 0; } -// use Bilinear for upsampling chroma -void ScalePlaneBilinear(int src_width, int src_height, - int dst_width, int dst_height, - int src_stride, int dst_stride, - const uint8* src_ptr, uint8* dst_ptr); - +// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler. // 411 chroma is 1/4 width, 1x height // 420 chroma is 1/2 width, 1/2 height LIBYUV_API @@ -256,19 +252,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y, int halfheight = (height + 1) >> 1; int quarterwidth = (width + 3) >> 2; - // Resample U plane. - ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height - halfwidth, halfheight, // to 1/2 width, 1/2 height - src_stride_u, - dst_stride_u, - src_u, dst_u); + // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height. + ScalePlane(src_u, src_stride_u, quarterwidth, height, + dst_u, dst_stride_u, halfwidth, halfheight, + kFilterNone); // Resample V plane. - ScalePlaneBilinear(quarterwidth, height, // from 1/4 width, 1x height - halfwidth, halfheight, // to 1/2 width, 1/2 height - src_stride_v, - dst_stride_v, - src_v, dst_v); + ScalePlane(src_v, src_stride_v, quarterwidth, height, + dst_v, dst_stride_v, halfwidth, halfheight, + kFilterNone); return 0; } @@ -1738,7 +1730,6 @@ static void JpegI400ToI420(void* opaque, LIBYUV_API int MJPGSize(const uint8* sample, size_t sample_size, int* width, int* height) { - // TODO(fbarchard): Port to C MJpegDecoder mjpeg_decoder; bool ret = mjpeg_decoder.LoadFrame(sample, sample_size); if (ret) { @@ -1764,7 +1755,7 @@ int MJPGToI420(const uint8* sample, return -1; } - // TODO(fbarchard): Port to C + // TODO(fbarchard): Port MJpeg to C. 
 MJpegDecoder mjpeg_decoder;
 bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
 if (ret && (mjpeg_decoder.GetWidth() != w ||
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index b9c46d222..b9ec60f98 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -230,13 +230,19 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                        uint8* rgb_buf,
                        int width) = YToARGBRow_C;
 #if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_SSE2;
+    YToARGBRow = YToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_SSE2;
+    }
   }
 #elif defined(HAS_YTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    YToARGBRow = YToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YToARGBRow = YToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_NEON;
+    }
   }
 #endif
@@ -941,7 +947,7 @@ int MJPGToARGB(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||
diff --git a/source/convert_from.cc b/source/convert_from.cc
index a4233b1c2..549af8564 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -16,6 +16,7 @@
 #include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"
@@ -98,12 +99,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
-
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 LIBYUV_API
 int I420ToI444(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -136,19 +132,15 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  // Upsample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, width, height,
+             kFilterNone);
   // Upsample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, width, height,
+             kFilterNone);
   return 0;
 }
@@ -187,19 +179,15 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, quarterwidth, height,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, quarterwidth, height,
+             kFilterNone);
   return 0;
 }
@@ -360,7 +348,6 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index 4032080f9..b932beb54 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -138,9 +138,8 @@ static int MipsCpuCaps(const char* search_string) {
 #endif
 // CPU detect function for SIMD instruction sets.
-// TODO(fbarchard): Use constant if/when valgrind says cpu_info is initialized.
 LIBYUV_API
-int cpu_info_ = 1;  // 1 means cpu info is not initialized yet.
+int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 2da083abd..6b5c3ecfd 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -767,18 +767,32 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
   void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
@@ -1126,9 +1140,8 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 }
 // Interpolate 2 ARGB images by specified amount (0 to 255).
-// TODO(fbarchard): Check width is multiple of 16. Do Any version.
-// TODO(fbarchard): Consider selecting a specialized interpolator so
-// interpolation doesn't need to be checked on each row.
+// TODO(fbarchard): Consider selecting a specialization for interpolation so
+// row function doesn't need to check interpolation on each row.
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,
@@ -1147,15 +1160,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) = ARGBInterpolateRow_C;
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
       IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
   }
 #elif defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
     ARGBInterpolateRow = ARGBInterpolateRow_NEON;
   }
 #endif
diff --git a/source/rotate_neon.cc b/source/rotate_neon.cc
index 49b300325..a0b3d291f 100644
--- a/source/rotate_neon.cc
+++ b/source/rotate_neon.cc
@@ -113,8 +113,8 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "vtbl.8    d0, {d2, d3}, d6                \n"
     "vtbl.8    d1, {d2, d3}, d7                \n"
-    // TODO: rework shuffle above to write
-    // out with 4 instead of 8 writes
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
     "vst1.32   {d4[0]}, [r9], %3               \n"
     "vst1.32   {d4[1]}, [r9], %3               \n"
     "vst1.32   {d5[0]}, [r9], %3               \n"
@@ -276,7 +276,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "cmp       %6, #4                          \n"
     "blt       2f                              \n"
-    //TODO(frkoenig) : clean this up
+    // TODO(frkoenig): Clean this up.
     // 4x8 block
     "mov       r9, %0                          \n"
    "vld1.64   {d0}, [r9], %1                  \n"
diff --git a/source/row_any.cc b/source/row_any.cc
index 78921afd2..ce9f352c1 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -141,6 +141,8 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        3, 4, 2)
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
        15, 2, 4)
 RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,
@@ -157,6 +159,8 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
        7, 4, 2)
 RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
        7, 2, 4)
 RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,
@@ -226,6 +230,28 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
+// Attenuate is destructive, so the last16 method cannot be used due to overlap.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \ + void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ + int n = width & ~MASK; \ + ARGBTOY_SIMD(src_argb, dst_y, n); \ + ARGBTOY_C(src_argb + n * SBPP, \ + dst_y + n * BPP, width & MASK); \ + } + +#ifdef HAS_ARGBATTENUATEROW_SSSE3 +YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_SSE2 +YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C, + 4, 4, 3) +#endif +#ifdef HAS_ARGBATTENUATEROW_NEON +YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C, + 4, 4, 7) +#endif + // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C. #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \ void NAMEANY(const uint8* src_argb, int src_stride_argb, \ diff --git a/source/row_neon.cc b/source/row_neon.cc index 67ff79736..d859ca7b3 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2418,6 +2418,61 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, ); } +// Attenuate 8 pixels at a time. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmull.u8 q10, d0, d3 \n" // b * a + "vmull.u8 q11, d1, d3 \n" // g * a + "vmull.u8 q12, d2, d3 \n" // r * a + "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 + "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 + "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q10", "q11", "q12" + ); +} + +#ifdef ARGBATTENUATEROW_VQRDMULH +// TODO(fbarchard): Remove this. Works but is slower and off by 2. +void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { + asm volatile ( + // Attenuate 8 pixels. + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB. + "subs %2, %2, #8 \n" // 8 processed per loop. + "vmovl.u8 q0, d0 \n" + "vmovl.u8 q1, d2 \n" + "vmovl.u8 q2, d4 \n" + "vmovl.u8 q8, d6 \n" + "vshl.u16 q0, q0, #7 \n" // b << 7 + "vshl.u16 q1, q1, #7 \n" // g << 7 + "vshl.u16 q2, q2, #7 \n" // r << 7 + "vqrdmulh.s16 q0, q0, q8 \n" // b * a + "vqrdmulh.s16 q1, q1, q8 \n" // g * a + "vqrdmulh.s16 q2, q2, q8 \n" // r * a + "vmovn.u16 d0, q0 \n" + "vmovn.u16 d2, q1 \n" + "vmovn.u16 d4, q2 \n" + "vst4.8 {d0, d2, d4, d6}, [%1]! \n" // store 8 pixels of ARGB. + "bgt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q8" + ); +} +#endif + #endif // __ARM_NEON__ #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index aab0e920d..920a8c404 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -3519,7 +3519,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATE_SSE2 +#ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. 
// aligned to 16 bytes void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { @@ -3564,7 +3564,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { #endif ); } -#endif // HAS_ARGBATTENUATE_SSE2 +#endif // HAS_ARGBATTENUATEROW_SSE2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha @@ -4132,7 +4132,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, #ifdef HAS_ARGBAFFINEROW_SSE2 // TODO(fbarchard): Find 64 bit way to avoid masking. -// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2. // Copy ARGB pixels from source image with slope to a row of destination. // Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing // an error if movq is used. movd %%xmm0,%1 diff --git a/source/row_win.cc b/source/row_win.cc index a2d96124a..b0d8a1117 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1675,7 +1675,6 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB }; static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG }; static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; -// TODO(fbarchard): NV12/NV21 fetch UV and use directly. // TODO(fbarchard): Read that does half size on Y and treats 420 as 444. // Read 8 UV from 411. @@ -3701,7 +3700,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, } #endif // HAS_ARGBBLENDROW_SSSE3 -#ifdef HAS_ARGBATTENUATE_SSE2 +#ifdef HAS_ARGBATTENUATEROW_SSE2 // Attenuate 4 pixels at a time. // Aligned to 16 bytes. __declspec(naked) __declspec(align(16)) @@ -3743,7 +3742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { ret } } -#endif // HAS_ARGBATTENUATE_SSE2 +#endif // HAS_ARGBATTENUATEROW_SSE2 #ifdef HAS_ARGBATTENUATEROW_SSSE3 // Shuffle table duplicating alpha. 
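For reference, the attenuate rows above (SSE2, SSSE3, and the new NEON version) all compute per-channel premultiplication by alpha. A minimal C sketch of one row follows; it mirrors the rounding of the NEON path (vqrshrn.u16 #8, i.e. (c * a + 128) >> 8) and uses hypothetical names: AttenuateRow_Sketch is illustrative, not libyuv's ARGBAttenuateRow_C, whose exact rounding may differ.

#include <stdint.h>

// Premultiply one row of ARGB pixels (B, G, R, A byte order in memory) by
// their alpha. Rounding matches the NEON vqrshrn.u16 #8 above:
// result = (channel * alpha + 128) >> 8.
static void AttenuateRow_Sketch(const uint8_t* src_argb, uint8_t* dst_argb,
                                int width) {
  for (int i = 0; i < width; ++i) {
    uint32_t b = src_argb[0];
    uint32_t g = src_argb[1];
    uint32_t r = src_argb[2];
    uint32_t a = src_argb[3];
    dst_argb[0] = (uint8_t)((b * a + 128) >> 8);  // b * a, rounded.
    dst_argb[1] = (uint8_t)((g * a + 128) >> 8);  // g * a, rounded.
    dst_argb[2] = (uint8_t)((r * a + 128) >> 8);  // r * a, rounded.
    dst_argb[3] = (uint8_t)a;                     // Alpha passes through.
    src_argb += 4;
    dst_argb += 4;
  }
}

Note that a fully opaque channel does not quite survive: (255 * 255 + 128) >> 8 is 254, which is why the planar_test.cc checks below switch to EXPECT_NEAR with a tolerance of 1.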
diff --git a/source/scale.cc b/source/scale.cc index f686dc67e..0c6036a74 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -3091,18 +3091,18 @@ void ScalePlaneBilinear(int src_width, int src_height, int dst_width, int source_y_fraction) = ScaleFilterRows_C; #if defined(HAS_SCALEFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) { ScaleFilterRows = ScaleFilterRows_NEON; } #endif #if defined(HAS_SCALEFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) && IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleFilterRows = ScaleFilterRows_SSE2; } #endif #if defined(HAS_SCALEFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) { ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3; if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleFilterRows = ScaleFilterRows_SSSE3; @@ -3110,7 +3110,7 @@ void ScalePlaneBilinear(int src_width, int src_height, } #endif #if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2) - if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) && IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) { ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2; } @@ -3129,7 +3129,6 @@ void ScalePlaneBilinear(int src_width, int src_height, int yf = (y >> 8) & 255; const uint8* src = src_ptr + yi * src_stride; ScaleFilterRows(row, src, src_stride, src_width, yf); - row[src_width] = row[src_width - 1]; ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx); dst_ptr += dst_stride; y += dy; diff --git a/source/scale_argb.cc b/source/scale_argb.cc index 635269ad2..c0a8b8912 100644 --- a/source/scale_argb.cc +++ b/source/scale_argb.cc @@ -856,8 +856,7 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, int y1_fraction = source_y_fraction; int y0_fraction = 256 - y1_fraction; const uint8* src_ptr1 = src_argb + src_stride; - uint8* end = dst_argb + (dst_width << 2); - do { + for (int x = 0; x < dst_width - 1; x += 2) { dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; @@ -869,7 +868,14 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb, src_argb += 8; src_ptr1 += 8; dst_argb += 8; - } while (dst_argb < end); + } + if (dst_width & 1) { + dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8; + dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8; + dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8; + dst_argb[3] = (src_argb[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8; + dst_argb += 4; + } // Duplicate the last pixel (4 bytes) for filtering. dst_argb[0] = dst_argb[-4]; dst_argb[1] = dst_argb[-3]; @@ -975,21 +981,20 @@ static void ScaleARGBBilinear(int src_width, int src_height, ptrdiff_t src_stride, int dst_width, int source_y_fraction) = ScaleARGBFilterRows_C; -// TODO(fbarchard): Check aligned width. 
#if defined(HAS_SCALEARGBFILTERROWS_SSE2) - if (TestCpuFlag(kCpuHasSSE2) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) { + if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2; } #endif #if defined(HAS_SCALEARGBFILTERROWS_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3) && - IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) { + if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 4) && + IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) { ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3; } #endif #if defined(HAS_SCALEARGBFILTERROWS_NEON) - if (TestCpuFlag(kCpuHasNEON)) { + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 4)) { ScaleARGBFilterRows = ScaleARGBFilterRows_NEON; } #endif diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 3e4de6dc5..01c09b0ae 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -478,8 +478,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \ align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \ align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \ - memset(dst_argb32_c, 0, kWidth * 4 * kHeight); \ - memset(dst_argb32_opt, 0, kWidth * 4 * kHeight); \ + memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \ + memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \ FMT_B##ToARGB(dst_argb_c, kStrideB, \ dst_argb32_c, kWidth * 4, \ kWidth, kHeight); \ @@ -534,6 +534,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ align_buffer_64(dst_y_opt, kWidth * kHeight); \ align_buffer_64(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ align_buffer_64(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_y_c, 1, kWidth * kHeight); \ + memset(dst_u_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_v_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_y_opt, 2, kWidth * kHeight); \ + memset(dst_u_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ + memset(dst_v_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \ srandom(time(NULL)); \ for (int i = 0; i < kHeight; ++i) \ for (int j = 0; j < kStride; ++j) \ @@ -753,11 +759,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ align_buffer_page_end(src_argb, kStrideA * kHeightA); \ align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \ align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \ + memset(dst_argb_c, 0, kStrideB * kHeightB); \ + memset(dst_argb_opt, 0, kStrideB * kHeightB); \ for (int i = 0; i < kStrideA * kHeightA; ++i) { \ src_argb[i] = (random() & 0xff); \ } \ - memset(dst_argb_c, 0, kStrideB * kHeightB); \ - memset(dst_argb_opt, 0, kStrideB * kHeightB); \ MaskCpuFlags(0); \ FMT_A##To##FMT_B(src_argb, kStrideA, \ dst_argb_c, kStrideB, \ diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 11b286b13..169d96317 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -98,12 +98,75 @@ TEST_F(libyuvTest, TestAttenuate) { EXPECT_EQ(32, atten_pixels[128][1]); EXPECT_EQ(21, atten_pixels[128][2]); EXPECT_EQ(128, atten_pixels[128][3]); - EXPECT_EQ(255, atten_pixels[255][0]); - EXPECT_EQ(127, atten_pixels[255][1]); - EXPECT_EQ(85, atten_pixels[255][2]); + EXPECT_NEAR(255, atten_pixels[255][0], 1); + EXPECT_NEAR(127, atten_pixels[255][1], 1); + EXPECT_NEAR(85, atten_pixels[255][2], 1); EXPECT_EQ(255, atten_pixels[255][3]); } +static int TestAttenuateI(int width, int height, int 
benchmark_iterations,
+                          int invert, int off) {
+  const int kBpp = 4;
+  const int kStride = (width * kBpp + 15) & ~15;
+  align_buffer_64(src_argb, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+  srandom(time(NULL));
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (random() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(0);
+  ARGBAttenuate(src_argb + off, kStride,
+                dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(-1);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAttenuate(src_argb + off, kStride,
+                  dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_64(src_argb)
+  free_aligned_buffer_64(dst_argb_c)
+  free_aligned_buffer_64(dst_argb_opt)
+  return max_diff;
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Any) {
+  int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Invert) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Opt) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
 TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
   SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
   SIMD_ALIGNED(int32 added_pixels[16][16][4]);
@@ -632,7 +695,7 @@ TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
 #define TESTINTERPOLATE(TERP) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
-             benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
+             benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
              benchmark_width_, TERP, 1, _Unaligned, +, 1) \
     TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
@@ -648,42 +711,38 @@ TESTINTERPOLATE(255)
 static int TestBlend(int width, int height, int benchmark_iterations,
                      int invert, int off) {
-  const int BPP_A = 4;
-  const int STRIDE_A = 1;
-  const int BPP_B = 4;
-  const int STRIDE_B = 1;
-  const int kStrideA = (width * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
-  const int kStrideB = (width * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
-  align_buffer_64(src_argb_a, kStrideA * height + off);
-  align_buffer_64(src_argb_b, kStrideA * height + off);
-  align_buffer_64(dst_argb_c, kStrideB * height);
-  align_buffer_64(dst_argb_opt, kStrideB * height);
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
   srandom(time(NULL));
-  for (int i = 0; i < kStrideA * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     src_argb_a[i + off] = (random() & 0xff);
     src_argb_b[i + off] = (random() & 0xff);
   }
-  ARGBAttenuate(src_argb_a + off, kStrideA, src_argb_a + off, kStrideA, width,
+  ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
                 height);
-  ARGBAttenuate(src_argb_b + off, kStrideA, src_argb_b + off, kStrideA, width,
+  ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
                 height);
-  memset(dst_argb_c, 255, kStrideB * height);
-  memset(dst_argb_opt, 255, kStrideB * height);
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
   MaskCpuFlags(0);
-  ARGBBlend(src_argb_a + off, kStrideA,
-            src_argb_b + off, kStrideA,
-            dst_argb_c, kStrideB,
+  ARGBBlend(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_c, kStride,
             width, invert * height);
   MaskCpuFlags(-1);
   for (int i = 0; i < benchmark_iterations; ++i) {
-    ARGBBlend(src_argb_a + off, kStrideA,
-              src_argb_b + off, kStrideA,
-              dst_argb_opt, kStrideB,
+    ARGBBlend(src_argb_a + off, kStride,
+              src_argb_b + off, kStride,
+              dst_argb_opt, kStride,
               width, invert * height);
   }
   int max_diff = 0;
-  for (int i = 0; i < kStrideB * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     int abs_diff =
         abs(static_cast<int>(dst_argb_c[i]) -
             static_cast<int>(dst_argb_opt[i]));
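A closing note on the ARGBAttenuateRow_Any_* wrappers generated by the second YANY macro in row_any.cc: because attenuate is destructive, re-running the SIMD kernel over the last 16 bytes (the usual tail trick) would attenuate those pixels twice, so the remainder is finished with the C row instead. A sketch of that dispatch follows; the typedef and function names are illustrative, not libyuv's.

#include <stdint.h>

typedef void (*AttenuateRowFn)(const uint8_t* src_argb, uint8_t* dst_argb,
                               int width);

// Run the SIMD kernel on the largest prefix that is a multiple of its pixel
// granularity, then finish the 0..mask leftover pixels with the C version.
// mask is granularity - 1: 3 for the 4-pixel SSE2/SSSE3 rows, 7 for the
// 8-pixel NEON row, matching the YANY(..., 4, 4, MASK) instantiations above.
static void AttenuateRowAny(AttenuateRowFn simd, AttenuateRowFn c_version,
                            const uint8_t* src_argb, uint8_t* dst_argb,
                            int width, int mask) {
  int n = width & ~mask;        // Pixels the SIMD kernel will handle.
  if (n > 0) {                  // Callers in the patch guard width >= mask + 1.
    simd(src_argb, dst_argb, n);
  }
  c_version(src_argb + n * 4,   // 4 bytes per ARGB pixel.
            dst_argb + n * 4,
            width & mask);      // Remaining pixels, done once, in C.
}

This is the same shape as the macro the patch adds, written out as a function: ARGBAttenuate in planar_functions.cc selects an _Any_ variant whenever width is not a multiple of the kernel's granularity, which is what the ARGBAttenuate_Any test with benchmark_width_ - 1 exercises.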