Attenuate ARGB pixels NEON optimized

BUG=164
TEST=./libyuv_unittest --gtest_filter=*Atten*
Review URL: https://webrtc-codereview.appspot.com/937031

git-svn-id: http://libyuv.googlecode.com/svn/trunk@506 16f28f9a-4ce2-e073-06de-1de4eb20be90
Author: fbarchard@google.com
Date: 2012-11-28 20:02:55 +0000
Commit: 1d160cb99f (parent: 326a521aba)
18 changed files with 280 additions and 123 deletions

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 505
+Version: 506
 License: BSD
 License File: LICENSE

@@ -53,7 +53,7 @@ int ArmCpuCaps(const char* cpuinfo_name);
 // returns non-zero if instruction set is detected
 static __inline int TestCpuFlag(int test_flag) {
   LIBYUV_API extern int cpu_info_;
-  return (cpu_info_ == 1 ? InitCpuFlags() : cpu_info_) & test_flag;
+  return (cpu_info_ == kCpuInit ? InitCpuFlags() : cpu_info_) & test_flag;
 }
 // For testing, allow CPU flags to be disabled.

@@ -139,7 +139,7 @@ extern "C" {
 #if !defined(YUV_DISABLE_ASM) && \
     (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
     !defined(LIBYUV_SSSE3_ONLY)
-#define HAS_ARGBATTENUATE_SSE2
+#define HAS_ARGBATTENUATEROW_SSE2
 #define HAS_ARGBBLENDROW_SSE2
 #define HAS_MIRRORROW_SSE2
 #endif

@@ -221,6 +221,7 @@ extern "C" {
 // Effects
 #define HAS_ARGBINTERPOLATEROW_NEON
 #define HAS_ARGBBLENDROW_NEON
+#define HAS_ARGBATTENUATEROW_NEON
 #endif
 // The following are available on Mips platforms

@@ -935,6 +936,12 @@ void YToARGBRow_SSE2(const uint8* src_y,
 void YToARGBRow_NEON(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
+void YToARGBRow_Any_SSE2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
+void YToARGBRow_Any_NEON(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
 // ARGB preattenuated alpha blend.
 void ARGBBlendRow_SSSE3(const uint8* src_argb, const uint8* src_argb1,

@@ -1194,6 +1201,13 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width);
+void ARGBAttenuateRow_Any_SSE2(const uint8* src_argb, uint8* dst_argb,
+                               int width);
+void ARGBAttenuateRow_Any_SSSE3(const uint8* src_argb, uint8* dst_argb,
+                                int width);
+void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_argb,
+                               int width);
 // Inverse table for unattenuate, shared by C and SSE2.
 extern uint32 fixed_invtbl8[256];

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 505
+#define LIBYUV_VERSION 506
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT

@@ -18,6 +18,7 @@
 #endif
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"

@@ -215,12 +216,7 @@ int I444ToI420(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 // 411 chroma is 1/4 width, 1x height
 // 420 chroma is 1/2 width, 1/2 height
 LIBYUV_API

@@ -256,19 +252,15 @@ int I411ToI420(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/4 width, 1x height to 1/2 width, 1/2 height.
+  ScalePlane(src_u, src_stride_u, quarterwidth, height,
+             dst_u, dst_stride_u, halfwidth, halfheight,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(quarterwidth, height,  // from 1/4 width, 1x height
-                     halfwidth, halfheight,  // to 1/2 width, 1/2 height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, quarterwidth, height,
+             dst_v, dst_stride_v, halfwidth, halfheight,
+             kFilterNone);
   return 0;
 }

@@ -1738,7 +1730,6 @@ static void JpegI400ToI420(void* opaque,
 LIBYUV_API
 int MJPGSize(const uint8* sample, size_t sample_size,
              int* width, int* height) {
-  // TODO(fbarchard): Port to C
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret) {

@@ -1764,7 +1755,7 @@ int MJPGToI420(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||

@@ -230,14 +230,20 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                       uint8* rgb_buf,
                       int width) = YToARGBRow_C;
 #if defined(HAS_YTOARGBROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 8) &&
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    YToARGBRow = YToARGBRow_SSE2;
+    YToARGBRow = YToARGBRow_Any_SSE2;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_SSE2;
+    }
   }
 #elif defined(HAS_YTOARGBROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
-    YToARGBRow = YToARGBRow_NEON;
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    YToARGBRow = YToARGBRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      YToARGBRow = YToARGBRow_NEON;
+    }
   }
 #endif
   for (int y = 0; y < height; ++y) {
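The hunk above is the selection idiom this commit applies throughout: any width at or above the SIMD granularity picks the _Any_ wrapper (SIMD body plus C remainder), and an exact multiple upgrades to the pure SIMD row. As a sketch with descriptive comments (same shape as the hunk above, shown for the NEON branch):

    #if defined(HAS_YTOARGBROW_NEON)
      // Two-level selection: the _Any_ row handles any width >= 8 by running
      // NEON on the multiple-of-8 prefix and C on the leftover pixels; an
      // exact multiple of 8 skips the wrapper entirely.
      if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
        YToARGBRow = YToARGBRow_Any_NEON;   // SIMD body + C remainder.
        if (IS_ALIGNED(width, 8)) {
          YToARGBRow = YToARGBRow_NEON;     // pure SIMD, no remainder.
        }
      }
    #endif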
@@ -941,7 +947,7 @@ int MJPGToARGB(const uint8* sample,
     return -1;
   }
-  // TODO(fbarchard): Port to C
+  // TODO(fbarchard): Port MJpeg to C.
   MJpegDecoder mjpeg_decoder;
   bool ret = mjpeg_decoder.LoadFrame(sample, sample_size);
   if (ret && (mjpeg_decoder.GetWidth() != w ||

@@ -16,6 +16,7 @@
 #include "libyuv/format_conversion.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"  // For ScalePlane()
 #include "libyuv/video_common.h"
 #include "libyuv/row.h"

@@ -98,12 +99,7 @@ int I420ToI422(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// use Bilinear for upsampling chroma
-void ScalePlaneBilinear(int src_width, int src_height,
-                        int dst_width, int dst_height,
-                        int src_stride, int dst_stride,
-                        const uint8* src_ptr, uint8* dst_ptr);
+// TODO(fbarchard): Enable bilinear when fast enough or specialized upsampler.
 LIBYUV_API
 int I420ToI444(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,

@@ -136,19 +132,15 @@ int I420ToI444(const uint8* src_y, int src_stride_y,
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  // Upsample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Upsample U plane from 1/2 width, 1/2 height to 1x width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, width, height,
+             kFilterNone);
   // Upsample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,
-                     width, height,
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, width, height,
+             kFilterNone);
   return 0;
 }

@@ -187,19 +179,15 @@ int I420ToI411(const uint8* src_y, int src_stride_y,
   int halfheight = (height + 1) >> 1;
   int quarterwidth = (width + 3) >> 2;
-  // Resample U plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_u,
-                     dst_stride_u,
-                     src_u, dst_u);
+  // Resample U plane from 1/2 width, 1/2 height to 1/4 width, 1x height.
+  ScalePlane(src_u, src_stride_u, halfwidth, halfheight,
+             dst_u, dst_stride_u, quarterwidth, height,
+             kFilterNone);
   // Resample V plane.
-  ScalePlaneBilinear(halfwidth, halfheight,  // from 1/2 width, 1/2 height
-                     quarterwidth, height,  // to 1/4 width, 1x height
-                     src_stride_v,
-                     dst_stride_v,
-                     src_v, dst_v);
+  ScalePlane(src_v, src_stride_v, halfwidth, halfheight,
+             dst_v, dst_stride_v, quarterwidth, height,
+             kFilterNone);
   return 0;
 }

@@ -360,7 +348,6 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
   return 0;
 }
-// TODO(fbarchard): Deprecate, move or expand 422 support?
 LIBYUV_API
 int I422ToUYVY(const uint8* src_y, int src_stride_y,
               const uint8* src_u, int src_stride_u,

@@ -138,9 +138,8 @@ static int MipsCpuCaps(const char* search_string) {
 #endif
 // CPU detect function for SIMD instruction sets.
-// TODO(fbarchard): Use constant if/when valgrind says cpu_info is initialized.
 LIBYUV_API
-int cpu_info_ = 1;  // 1 means cpu info is not initialized yet.
+int cpu_info_ = kCpuInit;  // cpu_info is not initialized yet.
 // Test environment variable for disabling CPU features. Any non-zero value
 // to disable. Zero ignored to make it easy to set the variable on/off.

@@ -767,19 +767,33 @@ int ARGBAttenuate(const uint8* src_argb, int src_stride_argb,
   }
   void (*ARGBAttenuateRow)(const uint8* src_argb, uint8* dst_argb,
                            int width) = ARGBAttenuateRow_C;
-#if defined(HAS_ARGBATTENUATE_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 4) &&
+#if defined(HAS_ARGBATTENUATEROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSE2;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSE2;
+    }
   }
 #endif
 #if defined(HAS_ARGBATTENUATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4 &&
       IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-    ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_ARGBATTENUATEROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+    ARGBAttenuateRow = ARGBAttenuateRow_Any_NEON;
+    if (IS_ALIGNED(width, 8)) {
+      ARGBAttenuateRow = ARGBAttenuateRow_NEON;
+    }
   }
 #endif
   for (int y = 0; y < height; ++y) {

@@ -1126,9 +1140,8 @@ int ARGBShade(const uint8* src_argb, int src_stride_argb,
 }
 // Interpolate 2 ARGB images by specified amount (0 to 255).
-// TODO(fbarchard): Check width is multiple of 16. Do Any version.
-// TODO(fbarchard): Consider selecting a specialized interpolator so
-// interpolation doesn't need to be checked on each row.
+// TODO(fbarchard): Consider selecting a specialization for interpolation so
+// row function doesn't need to check interpolation on each row.
 LIBYUV_API
 int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                     const uint8* src_argb1, int src_stride_argb1,

@@ -1147,15 +1160,14 @@ int ARGBInterpolate(const uint8* src_argb0, int src_stride_argb0,
                              ptrdiff_t src_stride, int dst_width,
                              int source_y_fraction) = ARGBInterpolateRow_C;
 #if defined(HAS_ARGBINTERPOLATEROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_argb0, 16) && IS_ALIGNED(src_stride_argb0, 16) &&
       IS_ALIGNED(src_argb1, 16) && IS_ALIGNED(src_stride_argb1, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     ARGBInterpolateRow = ARGBInterpolateRow_SSSE3;
   }
 #elif defined(HAS_ARGBINTERPOLATEROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) &&
-      IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
     ARGBInterpolateRow = ARGBInterpolateRow_NEON;
   }
 #endif

@@ -113,8 +113,8 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
     "vtbl.8 d0, {d2, d3}, d6 \n"
     "vtbl.8 d1, {d2, d3}, d7 \n"
-    // TODO: rework shuffle above to write
-    // out with 4 instead of 8 writes
+    // TODO(frkoenig): Rework shuffle above to
+    // write out with 4 instead of 8 writes.
     "vst1.32 {d4[0]}, [r9], %3 \n"
     "vst1.32 {d4[1]}, [r9], %3 \n"
     "vst1.32 {d5[0]}, [r9], %3 \n"

@@ -276,7 +276,7 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
     "cmp %6, #4 \n"
     "blt 2f \n"
-    //TODO(frkoenig) : clean this up
+    // TODO(frkoenig): Clean this up.
     // 4x8 block
     "mov r9, %0 \n"
     "vld1.64 {d0}, [r9], %1 \n"

@@ -141,6 +141,8 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, ARGBToARGB4444Row_C,
        3, 4, 2)
 RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_Unaligned_SSE2, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_Unaligned_SSSE3, YUY2ToARGBRow_C,
        15, 2, 4)
 RGBANY(UYVYToARGBRow_Any_SSSE3, UYVYToARGBRow_Unaligned_SSSE3, UYVYToARGBRow_C,

@@ -157,6 +159,8 @@ RGBANY(ARGBToARGB4444Row_Any_NEON, ARGBToARGB4444Row_NEON, ARGBToARGB4444Row_C,
        7, 4, 2)
 RGBANY(I400ToARGBRow_Any_NEON, I400ToARGBRow_NEON, I400ToARGBRow_C,
        7, 1, 4)
+RGBANY(YToARGBRow_Any_NEON, YToARGBRow_NEON, YToARGBRow_C,
+       7, 1, 4)
 RGBANY(YUY2ToARGBRow_Any_NEON, YUY2ToARGBRow_NEON, YUY2ToARGBRow_C,
        7, 2, 4)
 RGBANY(UYVYToARGBRow_Any_NEON, UYVYToARGBRow_NEON, UYVYToARGBRow_C,

@@ -226,6 +230,28 @@ YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
+// Attenuate is destructive so last16 method can not be used due to overlap.
+#define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK) \
+    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+      int n = width & ~MASK; \
+      ARGBTOY_SIMD(src_argb, dst_y, n); \
+      ARGBTOY_C(src_argb + n * SBPP, \
+                dst_y + n * BPP, width & MASK); \
+    }
+#ifdef HAS_ARGBATTENUATEROW_SSSE3
+YANY(ARGBAttenuateRow_Any_SSSE3, ARGBAttenuateRow_SSSE3, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_SSE2
+YANY(ARGBAttenuateRow_Any_SSE2, ARGBAttenuateRow_SSE2, ARGBAttenuateRow_C,
+     4, 4, 3)
+#endif
+#ifdef HAS_ARGBATTENUATEROW_NEON
+YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
+     4, 4, 7)
+#endif
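For reference, the NEON instantiation above expands mechanically to roughly the following (MASK = 7 keeps the multiple-of-8 prefix for NEON and leaves at most 7 pixels for C):

    void ARGBAttenuateRow_Any_NEON(const uint8* src_argb, uint8* dst_y,
                                   int width) {
      int n = width & ~7;                   // largest multiple of 8 <= width.
      ARGBAttenuateRow_NEON(src_argb, dst_y, n);
      ARGBAttenuateRow_C(src_argb + n * 4,  // remaining 0-7 pixels in C,
                         dst_y + n * 4,     // 4 bytes per ARGB pixel.
                         width & 7);
    }

Because attenuation can run in place (src == dst), the "redo the last 16 pixels" trick used elsewhere in this file would double-attenuate the overlapping pixels; hence the separate C remainder noted in the comment above.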
 // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
 #define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
     void NAMEANY(const uint8* src_argb, int src_stride_argb, \

@@ -2418,6 +2418,61 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
   );
 }
+// Attenuate 8 pixels at a time.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+    "1: \n"
+    "vld4.8 {d0, d1, d2, d3}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vmull.u8 q10, d0, d3 \n"  // b * a
+    "vmull.u8 q11, d1, d3 \n"  // g * a
+    "vmull.u8 q12, d2, d3 \n"  // r * a
+    "vqrshrn.u16 d0, q10, #8 \n"  // b >>= 8
+    "vqrshrn.u16 d1, q11, #8 \n"  // g >>= 8
+    "vqrshrn.u16 d2, q12, #8 \n"  // r >>= 8
+    "vst4.8 {d0, d1, d2, d3}, [%1]! \n"  // store 8 pixels of ARGB.
+    "bgt 1b \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+  );
+}
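As a reading aid, here is the per-pixel math the loop above implements, in scalar form (a sketch, not code from the commit; the helper name is hypothetical). vqrshrn.u16 is a rounding narrowing shift, so the NEON path computes a rounded product, while the C reference row rounds slightly differently; that is why the updated tests below tolerate a difference of 1-2.

    // Scalar equivalent of one attenuated pixel in the NEON loop (sketch).
    static void AttenuatePixelNEONStyle(const unsigned char* src,
                                        unsigned char* dst) {
      int a = src[3];
      dst[0] = (src[0] * a + 128) >> 8;  // b: vmull.u8 then vqrshrn (rounds).
      dst[1] = (src[1] * a + 128) >> 8;  // g
      dst[2] = (src[2] * a + 128) >> 8;  // r
      dst[3] = a;                        // d3 is stored back unchanged.
    }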
+#ifdef ARGBATTENUATEROW_VQRDMULH
+// TODO(fbarchard): Remove this. Works but is slower and off by 2.
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+  asm volatile (
+    // Attenuate 8 pixels.
+    "1: \n"
+    "vld4.8 {d0, d2, d4, d6}, [%0]! \n"  // load 8 pixels of ARGB.
+    "subs %2, %2, #8 \n"  // 8 processed per loop.
+    "vmovl.u8 q0, d0 \n"
+    "vmovl.u8 q1, d2 \n"
+    "vmovl.u8 q2, d4 \n"
+    "vmovl.u8 q8, d6 \n"
+    "vshl.u16 q0, q0, #7 \n"  // b << 7
+    "vshl.u16 q1, q1, #7 \n"  // g << 7
+    "vshl.u16 q2, q2, #7 \n"  // r << 7
+    "vqrdmulh.s16 q0, q0, q8 \n"  // b * a
+    "vqrdmulh.s16 q1, q1, q8 \n"  // g * a
+    "vqrdmulh.s16 q2, q2, q8 \n"  // r * a
+    "vmovn.u16 d0, q0 \n"
+    "vmovn.u16 d2, q1 \n"
+    "vmovn.u16 d4, q2 \n"
+    "vst4.8 {d0, d2, d4, d6}, [%1]! \n"  // store 8 pixels of ARGB.
+    "bgt 1b \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_argb),  // %1
+    "+r"(width)      // %2
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3", "q8"
+  );
+}
+#endif
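On the ifdef'd alternative above: vqrdmulh.s16 is the saturating rounding doubling multiply, whose architectural definition for 16-bit lanes is result = sat16((2 * a * b + 0x8000) >> 16). Pre-shifting each color channel left by 7 lines it up with the same fixed point as the vqrshrn version:

    // With c in 0..255 shifted left 7 (still fits in s16) and alpha a in 0..255:
    //   (2 * (c << 7) * a + 0x8000) >> 16
    // = (c * a * 256 + 32768) >> 16
    // = (c * a + 128) >> 8

In exact arithmetic that matches the primary loop, so the "off by 2" in the TODO is presumably measured against the C reference; either way the variant is slower and kept only for reference.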
 #endif  // __ARM_NEON__
 #ifdef __cplusplus

@@ -3519,7 +3519,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_ARGBATTENUATE_SSE2
+#ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // aligned to 16 bytes
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {

@@ -3564,7 +3564,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
 #endif
   );
 }
-#endif  // HAS_ARGBATTENUATE_SSE2
+#endif  // HAS_ARGBATTENUATEROW_SSE2
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha

@@ -4132,7 +4132,6 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
 #ifdef HAS_ARGBAFFINEROW_SSE2
 // TODO(fbarchard): Find 64 bit way to avoid masking.
-// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
 // Copy ARGB pixels from source image with slope to a row of destination.
 // Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
 // an error if movq is used. movd %%xmm0,%1

@@ -1675,7 +1675,6 @@ static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
 static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
 static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
 // TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
 // Read 8 UV from 411.

@@ -3701,7 +3700,7 @@ void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
 }
 #endif  // HAS_ARGBBLENDROW_SSSE3
-#ifdef HAS_ARGBATTENUATE_SSE2
+#ifdef HAS_ARGBATTENUATEROW_SSE2
 // Attenuate 4 pixels at a time.
 // Aligned to 16 bytes.
 __declspec(naked) __declspec(align(16))

@@ -3743,7 +3742,7 @@ void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
     ret
   }
 }
-#endif  // HAS_ARGBATTENUATE_SSE2
+#endif  // HAS_ARGBATTENUATEROW_SSE2
 #ifdef HAS_ARGBATTENUATEROW_SSSE3
 // Shuffle table duplicating alpha.

@@ -3091,18 +3091,18 @@ void ScalePlaneBilinear(int src_width, int src_height,
                           int dst_width, int source_y_fraction) =
       ScaleFilterRows_C;
 #if defined(HAS_SCALEFILTERROWS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 16)) {
     ScaleFilterRows = ScaleFilterRows_NEON;
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 16) &&
       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleFilterRows = ScaleFilterRows_SSE2;
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 16)) {
     ScaleFilterRows = ScaleFilterRows_Unaligned_SSSE3;
     if (IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
       ScaleFilterRows = ScaleFilterRows_SSSE3;

@@ -3110,7 +3110,7 @@ void ScalePlaneBilinear(int src_width, int src_height,
   }
 #endif
 #if defined(HAS_SCALEFILTERROWS_MIPS_DSPR2)
-  if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+  if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
       IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4)) {
     ScaleFilterRows = ScaleFilterRows_MIPS_DSPR2;
   }

@@ -3129,7 +3129,6 @@ void ScalePlaneBilinear(int src_width, int src_height,
     int yf = (y >> 8) & 255;
     const uint8* src = src_ptr + yi * src_stride;
     ScaleFilterRows(row, src, src_stride, src_width, yf);
-    row[src_width] = row[src_width - 1];
     ScaleFilterCols_C(dst_ptr, row, dst_width, x, dx);
     dst_ptr += dst_stride;
     y += dy;

@@ -856,8 +856,7 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
   int y1_fraction = source_y_fraction;
   int y0_fraction = 256 - y1_fraction;
   const uint8* src_ptr1 = src_argb + src_stride;
-  uint8* end = dst_argb + (dst_width << 2);
-  do {
+  for (int x = 0; x < dst_width - 1; x += 2) {
     dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
     dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
     dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;

@@ -869,7 +868,14 @@ void ScaleARGBFilterRows_C(uint8* dst_argb, const uint8* src_argb,
     src_argb += 8;
     src_ptr1 += 8;
     dst_argb += 8;
-  } while (dst_argb < end);
+  }
+  if (dst_width & 1) {
+    dst_argb[0] = (src_argb[0] * y0_fraction + src_ptr1[0] * y1_fraction) >> 8;
+    dst_argb[1] = (src_argb[1] * y0_fraction + src_ptr1[1] * y1_fraction) >> 8;
+    dst_argb[2] = (src_argb[2] * y0_fraction + src_ptr1[2] * y1_fraction) >> 8;
+    dst_argb[3] = (src_argb[3] * y0_fraction + src_ptr1[3] * y1_fraction) >> 8;
+    dst_argb += 4;
+  }
   // Duplicate the last pixel (4 bytes) for filtering.
   dst_argb[0] = dst_argb[-4];
   dst_argb[1] = dst_argb[-3];

@@ -975,21 +981,20 @@ static void ScaleARGBBilinear(int src_width, int src_height,
                               ptrdiff_t src_stride,
                               int dst_width, int source_y_fraction) =
       ScaleARGBFilterRows_C;
-  // TODO(fbarchard): Check aligned width.
 #if defined(HAS_SCALEARGBFILTERROWS_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_SSE2;
   }
 #endif
 #if defined(HAS_SCALEARGBFILTERROWS_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3) &&
-      IS_ALIGNED(src_stride, 16) && IS_ALIGNED(src_argb, 16)) {
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_width, 4) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_SSSE3;
   }
 #endif
 #if defined(HAS_SCALEARGBFILTERROWS_NEON)
-  if (TestCpuFlag(kCpuHasNEON)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(src_width, 4)) {
     ScaleARGBFilterRows = ScaleARGBFilterRows_NEON;
   }
 #endif

@@ -478,8 +478,8 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \
   /* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
   align_buffer_64(dst_argb32_c, kWidth * 4 * kHeight); \
   align_buffer_64(dst_argb32_opt, kWidth * 4 * kHeight); \
-  memset(dst_argb32_c, 0, kWidth * 4 * kHeight); \
-  memset(dst_argb32_opt, 0, kWidth * 4 * kHeight); \
+  memset(dst_argb32_c, 1, kWidth * 4 * kHeight); \
+  memset(dst_argb32_opt, 2, kWidth * 4 * kHeight); \
   FMT_B##ToARGB(dst_argb_c, kStrideB, \
                 dst_argb32_c, kWidth * 4, \
                 kWidth, kHeight); \

@@ -534,6 +534,12 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \
   align_buffer_64(dst_y_opt, kWidth * kHeight); \
   align_buffer_64(dst_u_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   align_buffer_64(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_y_c, 1, kWidth * kHeight); \
+  memset(dst_u_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_v_c, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_y_opt, 2, kWidth * kHeight); \
+  memset(dst_u_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
+  memset(dst_v_opt, 0, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   srandom(time(NULL)); \
   for (int i = 0; i < kHeight; ++i) \
     for (int j = 0; j < kStride; ++j) \

@@ -753,11 +759,11 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \
   align_buffer_page_end(src_argb, kStrideA * kHeightA); \
   align_buffer_page_end(dst_argb_c, kStrideB * kHeightB); \
   align_buffer_page_end(dst_argb_opt, kStrideB * kHeightB); \
-  memset(dst_argb_c, 0, kStrideB * kHeightB); \
-  memset(dst_argb_opt, 0, kStrideB * kHeightB); \
   for (int i = 0; i < kStrideA * kHeightA; ++i) { \
     src_argb[i] = (random() & 0xff); \
   } \
+  memset(dst_argb_c, 0, kStrideB * kHeightB); \
+  memset(dst_argb_opt, 0, kStrideB * kHeightB); \
   MaskCpuFlags(0); \
   FMT_A##To##FMT_B(src_argb, kStrideA, \
                    dst_argb_c, kStrideB, \

@@ -98,12 +98,75 @@ TEST_F(libyuvTest, TestAttenuate) {
   EXPECT_EQ(32, atten_pixels[128][1]);
   EXPECT_EQ(21, atten_pixels[128][2]);
   EXPECT_EQ(128, atten_pixels[128][3]);
-  EXPECT_EQ(255, atten_pixels[255][0]);
-  EXPECT_EQ(127, atten_pixels[255][1]);
-  EXPECT_EQ(85, atten_pixels[255][2]);
+  EXPECT_NEAR(255, atten_pixels[255][0], 1);
+  EXPECT_NEAR(127, atten_pixels[255][1], 1);
+  EXPECT_NEAR(85, atten_pixels[255][2], 1);
   EXPECT_EQ(255, atten_pixels[255][3]);
 }
+
+static int TestAttenuateI(int width, int height, int benchmark_iterations,
+                          int invert, int off) {
+  const int kBpp = 4;
+  const int kStride = (width * kBpp + 15) & ~15;
+  align_buffer_64(src_argb, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
+  srandom(time(NULL));
+  for (int i = 0; i < kStride * height; ++i) {
+    src_argb[i + off] = (random() & 0xff);
+  }
+  memset(dst_argb_c, 0, kStride * height);
+  memset(dst_argb_opt, 0, kStride * height);
+
+  MaskCpuFlags(0);
+  ARGBAttenuate(src_argb + off, kStride,
+                dst_argb_c, kStride,
+                width, invert * height);
+  MaskCpuFlags(-1);
+  for (int i = 0; i < benchmark_iterations; ++i) {
+    ARGBAttenuate(src_argb + off, kStride,
+                  dst_argb_opt, kStride,
+                  width, invert * height);
+  }
+  int max_diff = 0;
+  for (int i = 0; i < kStride * height; ++i) {
+    int abs_diff =
+        abs(static_cast<int>(dst_argb_c[i]) -
+            static_cast<int>(dst_argb_opt[i]));
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+  free_aligned_buffer_64(src_argb)
+  free_aligned_buffer_64(dst_argb_c)
+  free_aligned_buffer_64(dst_argb_opt)
+  return max_diff;
+}
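A note on the invert parameter used by the tests that follow: passing invert * height with invert = -1 exercises libyuv's negative-height convention, which means "process the image bottom-up". Functions such as ARGBAttenuate normalize it internally with the usual idiom; this is a sketch of that convention, not code from this commit:

    // Negative height means the image is inverted vertically (sketch).
    if (height < 0) {
      height = -height;
      src_argb = src_argb + (height - 1) * src_stride_argb;  // last row first.
      src_stride_argb = -src_stride_argb;                    // step upward.
    }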
+TEST_F(libyuvTest, ARGBAttenuate_Any) {
+  int max_diff = TestAttenuateI(benchmark_width_ - 1, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Unaligned) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 1);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Invert) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, -1, 0);
+  EXPECT_LE(max_diff, 2);
+}
+
+TEST_F(libyuvTest, ARGBAttenuate_Opt) {
+  int max_diff = TestAttenuateI(benchmark_width_, benchmark_height_,
+                                benchmark_iterations_, +1, 0);
+  EXPECT_LE(max_diff, 2);
+}
 TEST_F(libyuvTest, TestARGBComputeCumulativeSum) {
   SIMD_ALIGNED(uint8 orig_pixels[16][16][4]);
   SIMD_ALIGNED(int32 added_pixels[16][16][4]);

@@ -632,7 +695,7 @@ TEST_F(libyuvTest, ARGBInterpolate##TERP##N) { \
 #define TESTINTERPOLATE(TERP) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
-           benchmark_width_ - 4, TERP, 1, _Any, +, 0) \
+           benchmark_width_ - 1, TERP, 1, _Any, +, 0) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \
            benchmark_width_, TERP, 1, _Unaligned, +, 1) \
   TESTTERP(ARGB, 4, 1, ARGB, 4, 1, \

@@ -648,42 +711,38 @@ TESTINTERPOLATE(255)
 static int TestBlend(int width, int height, int benchmark_iterations,
                      int invert, int off) {
-  const int BPP_A = 4;
-  const int STRIDE_A = 1;
-  const int BPP_B = 4;
-  const int STRIDE_B = 1;
-  const int kStrideA = (width * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A;
-  const int kStrideB = (width * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B;
-  align_buffer_64(src_argb_a, kStrideA * height + off);
-  align_buffer_64(src_argb_b, kStrideA * height + off);
-  align_buffer_64(dst_argb_c, kStrideB * height);
-  align_buffer_64(dst_argb_opt, kStrideB * height);
+  const int kBpp = 4;
+  const int kStride = width * kBpp;
+  align_buffer_64(src_argb_a, kStride * height + off);
+  align_buffer_64(src_argb_b, kStride * height + off);
+  align_buffer_64(dst_argb_c, kStride * height);
+  align_buffer_64(dst_argb_opt, kStride * height);
   srandom(time(NULL));
-  for (int i = 0; i < kStrideA * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     src_argb_a[i + off] = (random() & 0xff);
     src_argb_b[i + off] = (random() & 0xff);
   }
-  ARGBAttenuate(src_argb_a + off, kStrideA, src_argb_a + off, kStrideA, width,
+  ARGBAttenuate(src_argb_a + off, kStride, src_argb_a + off, kStride, width,
                 height);
-  ARGBAttenuate(src_argb_b + off, kStrideA, src_argb_b + off, kStrideA, width,
+  ARGBAttenuate(src_argb_b + off, kStride, src_argb_b + off, kStride, width,
                 height);
-  memset(dst_argb_c, 255, kStrideB * height);
-  memset(dst_argb_opt, 255, kStrideB * height);
+  memset(dst_argb_c, 255, kStride * height);
+  memset(dst_argb_opt, 255, kStride * height);
   MaskCpuFlags(0);
-  ARGBBlend(src_argb_a + off, kStrideA,
-            src_argb_b + off, kStrideA,
-            dst_argb_c, kStrideB,
+  ARGBBlend(src_argb_a + off, kStride,
+            src_argb_b + off, kStride,
+            dst_argb_c, kStride,
             width, invert * height);
   MaskCpuFlags(-1);
   for (int i = 0; i < benchmark_iterations; ++i) {
-    ARGBBlend(src_argb_a + off, kStrideA,
-              src_argb_b + off, kStrideA,
-              dst_argb_opt, kStrideB,
+    ARGBBlend(src_argb_a + off, kStride,
+              src_argb_b + off, kStride,
+              dst_argb_opt, kStride,
              width, invert * height);
   }
   int max_diff = 0;
-  for (int i = 0; i < kStrideB * height; ++i) {
+  for (int i = 0; i < kStride * height; ++i) {
     int abs_diff =
         abs(static_cast<int>(dst_argb_c[i]) -
             static_cast<int>(dst_argb_opt[i]));