NV12Mirror and MirrorUVPlane functions added

HalfMergeUVRow AVX2 version added

Skylake Xeon performance for 1280x720:
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)

Slightly faster than the comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)

BUG=libyuv:840, libyuv:858

Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2020-05-04 12:32:28 -07:00 committed by Commit Bot
parent d9681c53b3
commit 7a61759f78
17 changed files with 432 additions and 87 deletions
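
For context, a minimal usage sketch of the new NV12Mirror entry point follows. It is only a sketch: the frame size, contiguous buffer layout, and the helper name MirrorNV12Frame are illustrative assumptions, not part of the change; the function signature itself is the one added to the planar functions header below.

#include <vector>

#include "libyuv/planar_functions.h"

// Sketch: horizontally mirror a 1280x720 NV12 frame with contiguous planes,
// so every stride equals the width. NV12Mirror returns 0 on success and -1
// on invalid arguments; a negative height inverts the source vertically.
int MirrorNV12Frame() {
  const int width = 1280;
  const int height = 720;
  std::vector<uint8_t> src_y(width * height);
  std::vector<uint8_t> src_uv(width * height / 2);  // interleaved UV, half height
  std::vector<uint8_t> dst_y(width * height);
  std::vector<uint8_t> dst_uv(width * height / 2);
  // ... fill src_y / src_uv with pixel data ...
  return libyuv::NV12Mirror(src_y.data(), width,   // Y plane and stride
                            src_uv.data(), width,  // UV plane and stride
                            dst_y.data(), width,
                            dst_uv.data(), width,
                            width, height);
}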

View File

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1749
+Version: 1751
 License: BSD
 License File: LICENSE

View File

@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
 height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height. aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
+Most NV12 functions allow the destination Y pointer to be NULL.
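
As a hedged illustration of the note above (the helper name MirrorChromaOnly is hypothetical; only the NULL destination Y behavior comes from this change), mirroring just the chroma of an NV12 frame could look like:

#include <cstdint>

#include "libyuv/planar_functions.h"

// Sketch: mirror only the interleaved UV plane of an NV12 image by passing
// NULL as the destination Y pointer, so the Y plane is left untouched
// (the case the NV12Mirror_NullY benchmark above exercises).
int MirrorChromaOnly(const uint8_t* src_y, const uint8_t* src_uv,
                     uint8_t* dst_uv, int width, int height) {
  // The source Y pointer must still be non-NULL; only dst_y may be NULL.
  return libyuv::NV12Mirror(src_y, width, src_uv, width,
                            /*dst_y=*/nullptr, /*dst_stride_y=*/0,
                            dst_uv, width, width, height);
}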

View File

@@ -190,7 +190,7 @@ mips
 make V=1 -f linux.mk
 make V=1 -f linux.mk clean
-make V=1 -f linux.mk CXX=clang++
+make V=1 -f linux.mk CXX=clang++ CC=clang
 
 ## Building the library with cmake

View File

@@ -314,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
                int width,
                int height);
 
+// Alias
+#define NV12ToNV12Mirror NV12Mirror
+
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height);
+
 // Alias
 #define ARGBToARGBMirror ARGBMirror
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
                  int width,
                  int height);
 
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height);
+
 // Convert NV12 to RGB565.
 LIBYUV_API
 int NV12ToRGB565(const uint8_t* src_y,

View File

@@ -274,16 +274,18 @@ extern "C" {
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
 #define HAS_HALFMERGEUVROW_SSSE3
+// I210 is for H010. 2 = 422. I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
 #define HAS_I422TOAR30ROW_SSSE3
 #define HAS_MERGERGBROW_SSSE3
+#define HAS_MIRRORUVROW_AVX2
+#define HAS_MIRRORUVROW_SSSE3
 #define HAS_RAWTORGBAROW_SSSE3
 #define HAS_RGB24MIRRORROW_SSSE3
 #define HAS_RGBATOYJROW_SSSE3
 #define HAS_SPLITRGBROW_SSSE3
 #define HAS_SWAPUVROW_SSSE3
 #endif
 
 // The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
 #define HAS_ARGBTORGB24ROW_AVX2
 #define HAS_CONVERT16TO8ROW_AVX2
 #define HAS_CONVERT8TO16ROW_AVX2
+#define HAS_HALFMERGEUVROW_AVX2
 #define HAS_I210TOAR30ROW_AVX2
 #define HAS_I210TOARGBROW_AVX2
 #define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
 #define HAS_J400TOARGBROW_NEON
 #define HAS_MERGEUVROW_NEON
 #define HAS_MIRRORROW_NEON
+#define HAS_MIRRORUVROW_NEON
 #define HAS_MIRRORSPLITUVROW_NEON
 #define HAS_NV12TOARGBROW_NEON
 #define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
 void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
+void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
+void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
 
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                           uint8_t* dst_uv,
                           int width);
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
 
 void SplitRGBRow_C(const uint8_t* src_rgb,
                    uint8_t* dst_r,
                    uint8_t* dst_g,

View File

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1749
+#define LIBYUV_VERSION 1751
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -775,7 +775,7 @@ int YUY2ToI420(const uint8_t* src_yuy2,
     }
   }
 #endif
 #if defined(HAS_YUY2TOYROW_MSA) && defined(HAS_YUY2TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     YUY2ToYRow = YUY2ToYRow_Any_MSA;
     YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
@@ -1476,7 +1476,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
     }
   }
 #endif
 #if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
   if (TestCpuFlag(kCpuHasMSA)) {
     RGB24ToUVRow = RGB24ToUVRow_Any_MSA;
     RGB24ToYRow = RGB24ToYRow_Any_MSA;

View File

@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
   }
 }
 
+// Mirror a plane of UV data.
+LIBYUV_API
+void MirrorUVPlane(const uint8_t* src_uv,
+                   int src_stride_uv,
+                   uint8_t* dst_uv,
+                   int dst_stride_uv,
+                   int width,
+                   int height) {
+  int y;
+  void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
+      MirrorUVRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uv = src_uv + (height - 1) * src_stride_uv;
+    src_stride_uv = -src_stride_uv;
+  }
+#if defined(HAS_MIRRORUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    MirrorUVRow = MirrorUVRow_Any_NEON;
+    if (IS_ALIGNED(width, 32)) {
+      MirrorUVRow = MirrorUVRow_NEON;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3)) {
+    MirrorUVRow = MirrorUVRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 8)) {
+      MirrorUVRow = MirrorUVRow_SSSE3;
+    }
+  }
+#endif
+#if defined(HAS_MIRRORUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    MirrorUVRow = MirrorUVRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      MirrorUVRow = MirrorUVRow_AVX2;
+    }
+  }
+#endif
+
+  // MirrorUV plane
+  for (y = 0; y < height; ++y) {
+    MirrorUVRow(src_uv, dst_uv, width);
+    src_uv += src_stride_uv;
+    dst_uv += dst_stride_uv;
+  }
+}
+
 // Mirror I400 with optional flipping
 LIBYUV_API
 int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
                int height) {
   int halfwidth = (width + 1) >> 1;
   int halfheight = (height + 1) >> 1;
-  if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
+  if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
       height == 0) {
     return -1;
   }
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
   return 0;
 }
 
+// NV12 mirror.
+LIBYUV_API
+int NV12Mirror(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_uv,
+               int src_stride_uv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  int halfwidth = (width + 1) >> 1;
+  int halfheight = (height + 1) >> 1;
+  if (!src_y || !src_uv || !dst_uv || width <= 0 ||
+      height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+    src_stride_y = -src_stride_y;
+    src_stride_uv = -src_stride_uv;
+  }
+
+  if (dst_y) {
+    MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
+                halfheight);
+  return 0;
+}
+
 // ARGB mirror.
 LIBYUV_API
 int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
     HalfMergeUVRow = HalfMergeUVRow_SSSE3;
   }
 #endif
+#if defined(HAS_HALFMERGEUVROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+    HalfMergeUVRow = HalfMergeUVRow_AVX2;
+  }
+#endif
   for (y = 0; y < height - 1; y += 2) {
     // Merge a row of U and V into a row of UV.
     HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);

View File

@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
   void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
                            int width) = MirrorSplitUVRow_C;
 #if defined(HAS_MIRRORSPLITUVROW_NEON)
-  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
     MirrorSplitUVRow = MirrorSplitUVRow_NEON;
   }
 #endif

View File

@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
 #if defined(HAS_ARGBMIRRORROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
-    if (IS_ALIGNED(width, 16)) {
+    if (IS_ALIGNED(width, 8)) {
       ARGBMirrorRow = ARGBMirrorRow_NEON;
     }
   }

View File

@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
 #ifdef HAS_MIRRORROW_MMI
 ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
 #endif
+#ifdef HAS_MIRRORUVROW_AVX2
+ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
+#endif
+#ifdef HAS_MIRRORUVROW_SSSE3
+ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
+#endif
+#ifdef HAS_MIRRORUVROW_NEON
+ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
+#endif
 #ifdef HAS_ARGBMIRRORROW_AVX2
 ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 #endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
 ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
 #endif
 #ifdef HAS_ARGBMIRRORROW_NEON
-ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
+ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
 #endif
 #ifdef HAS_ARGBMIRRORROW_MSA
 ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)

View File

@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
   }
 }
 
+void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  int x;
+  src_uv += (width - 1) << 1;
+  for (x = 0; x < width; ++x) {
+    dst_uv[0] = src_uv[0];
+    dst_uv[1] = src_uv[1];
+    src_uv -= 2;
+    dst_uv += 2;
+  }
+}
+
 void MirrorSplitUVRow_C(const uint8_t* src_uv,
                         uint8_t* dst_u,
                         uint8_t* dst_v,

View File

@@ -3229,10 +3229,62 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif  // HAS_MIRRORROW_AVX2
 
+#ifdef HAS_MIRRORUVROW_SSSE3
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+      "movdqa %3,%%xmm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "movdqu -0x10(%0,%2,2),%%xmm0 \n"
+      "pshufb %%xmm5,%%xmm0 \n"
+      "movdqu %%xmm0,(%1) \n"
+      "lea 0x10(%1),%1 \n"
+      "sub $0x8,%2 \n"
+      "jg 1b \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_SSSE3
+
+#ifdef HAS_MIRRORUVROW_AVX2
+void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  intptr_t temp_width = (intptr_t)(width);
+  asm volatile(
+      "vbroadcastf128 %3,%%ymm5 \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
+      "vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpermq $0x4e,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%1) \n"
+      "lea 0x20(%1),%1 \n"
+      "sub $0x10,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_uv),          // %0
+        "+r"(dst_uv),          // %1
+        "+r"(temp_width)       // %2
+      : "m"(kShuffleMirrorUV)  // %3
+      : "memory", "cc", "xmm0", "xmm5");
+}
+#endif  // HAS_MIRRORUVROW_AVX2
+
 #ifdef HAS_MIRRORSPLITUVROW_SSSE3
 // Shuffle table for reversing the bytes of UV channels.
-static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
-                                       15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
+                                            15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
+
 void MirrorSplitUVRow_SSSE3(const uint8_t* src,
                             uint8_t* dst_u,
                             uint8_t* dst_v,
@@ -3253,11 +3305,11 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
       "lea 0x8(%1),%1 \n"
       "sub $8,%3 \n"
      "jg 1b \n"
      : "+r"(src),                   // %0
        "+r"(dst_u),                 // %1
        "+r"(dst_v),                 // %2
        "+r"(temp_width)             // %3
-      : "m"(kShuffleMirrorUV)        // %4
+      : "m"(kShuffleMirrorSplitUV)   // %4
       : "memory", "cc", "xmm0", "xmm1");
 }
 #endif  // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
+void HalfMergeUVRow_AVX2(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  asm volatile(
+      "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
+      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+      "1: \n"
+
+      LABELALIGN
+      "1: \n"
+      "vmovdqu (%0),%%ymm0 \n"  // load 32 U values
+      "vmovdqu (%1),%%ymm1 \n"  // load 32 V values
+      "vmovdqu 0(%0,%4,1),%%ymm2 \n"  // 32 from next row
+      "vmovdqu 0(%1,%5,1),%%ymm3 \n"
+      "lea 0x20(%0),%0 \n"
+      "vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n"  // half size
+      "vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
+      "vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
+      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
+      "lea 0x20(%1),%1 \n"
+      "vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
+      "vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
+      "vpsrlw $0x1,%%ymm0,%%ymm0 \n"
+      "vpsrlw $0x1,%%ymm1,%%ymm1 \n"
+      "vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
+      "vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
+      "vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
+      "vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
+      "vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
+      "vmovdqu %%ymm0,(%2) \n"  // store 16 UV pixels
+      "lea 0x20(%2),%2 \n"
+      "sub $0x20,%3 \n"  // 32 src pixels per loop
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus

View File

@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
       : "cc", "memory", "q0", "q1", "q2");
 }
 
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "mov r12, #-16 \n"
+      "add %0, %0, %2, lsl #1 \n"
+      "sub %0, #16 \n"
+      "1: \n"
+      "vld2.8 {d0, d1}, [%0], r12 \n"  // src -= 16
+      "subs %2, #8 \n"  // 8 pixels per loop.
+      "vrev64.8 q0, q0 \n"
+      "vst2.8 {d0, d1}, [%1]! \n"  // dst += 16
+      "bgt 1b \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_uv),  // %1
+        "+r"(width)    // %2
+      :
+      : "cc", "memory", "r12", "q0");
+}
+
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,

View File

@@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
   asm volatile(
       // Start at end of source row.
-      "ld1 {v3.16b}, [%4] \n"  // shuffler
+      "ld1 {v3.16b}, [%3] \n"  // shuffler
       "add %0, %0, %w2, sxtw \n"
       "sub %0, %0, #32 \n"
       "1: \n"
-      "ld1 {v1.16b,v2.16b}, [%0], %3 \n"  // src -= 32
+      "ldr q2, [%0, 16] \n"
+      "ldr q1, [%0], -32 \n"  // src -= 32
       "subs %w2, %w2, #32 \n"  // 32 pixels per loop.
-      "tbl v1.16b, {v1.16b}, v3.16b \n"
       "tbl v0.16b, {v2.16b}, v3.16b \n"
+      "tbl v1.16b, {v1.16b}, v3.16b \n"
       "st1 {v0.16b, v1.16b}, [%1], #32 \n"  // store 32 pixels
       "b.gt 1b \n"
       : "+r"(src),    // %0
         "+r"(dst),    // %1
         "+r"(width)   // %2
-      : "r"((ptrdiff_t)-32),   // %3
-        "r"(&kShuffleMirror)   // %4
+      : "r"(&kShuffleMirror)   // %3
       : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
+// Shuffle table for reversing the UV.
+static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
+                                       6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
+
+void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
+  asm volatile(
+      // Start at end of source row.
+      "ld1 {v4.16b}, [%3] \n"  // shuffler
+      "add %0, %0, %w2, sxtw #1 \n"
+      "sub %0, %0, #32 \n"
+      "1: \n"
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
+      "b.gt 1b \n"
+      : "+r"(src_uv),  // %0
+        "+r"(dst_uv),  // %1
+        "+r"(width)    // %2
+      : "r"(&kShuffleMirrorUV)  // %3
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
+}
+
 void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
   asm volatile(
       // Start at end of source row.
+      "ld1 {v4.16b}, [%4] \n"  // shuffler
       "add %0, %0, %w3, sxtw #1 \n"
-      "sub %0, %0, #16 \n"
+      "sub %0, %0, #32 \n"
       "1: \n"
-      "ld2 {v0.8b, v1.8b}, [%0], %4 \n"  // src -= 16
-      "subs %w3, %w3, #8 \n"  // 8 pixels per loop.
-      "rev64 v0.8b, v0.8b \n"
-      "rev64 v1.8b, v1.8b \n"
-      "st1 {v0.8b}, [%1], #8 \n"  // dst += 8
-      "st1 {v1.8b}, [%2], #8 \n"
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w3, %w3, #16 \n"  // 16 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "uzp1 v0.16b, v2.16b, v3.16b \n"  // U
+      "uzp2 v1.16b, v2.16b, v3.16b \n"  // V
+      "st1 {v0.16b}, [%1], #16 \n"  // dst += 16
+      "st1 {v1.16b}, [%2], #16 \n"
       "b.gt 1b \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
        "+r"(dst_v),    // %2
        "+r"(width)     // %3
-      : "r"((ptrdiff_t)-16)  // %4
-      : "cc", "memory", "v0", "v1");
+      : "r"(&kShuffleMirrorUV)  // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
 
+// Shuffle table for reversing the ARGB.
+static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
+                                         4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
+
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
   asm volatile(
-      "ld1 {v4.16b}, [%4] \n"  // shuffler
-      "add %0, %0, %w2, sxtw #2 \n"  // Start at end of row.
-      "sub %0, %0, #64 \n"
+      // Start at end of source row.
+      "ld1 {v4.16b}, [%3] \n"  // shuffler
+      "add %0, %0, %w2, sxtw #2 \n"
+      "sub %0, %0, #32 \n"
       "1: \n"
-      "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3 \n"  // src -= 64
-      "subs %w2, %w2, #16 \n"  // 16 pixels per loop.
-      "tbl v0.16b, {v0.16b}, v4.16b \n"
-      "tbl v1.16b, {v1.16b}, v4.16b \n"
-      "tbl v2.16b, {v2.16b}, v4.16b \n"
-      "tbl v3.16b, {v3.16b}, v4.16b \n"
-      "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n"  // dst += 64
+      "ldr q1, [%0, 16] \n"
+      "ldr q0, [%0], -32 \n"  // src -= 32
+      "subs %w2, %w2, #8 \n"  // 8 pixels per loop.
+      "tbl v2.16b, {v1.16b}, v4.16b \n"
+      "tbl v3.16b, {v0.16b}, v4.16b \n"
+      "st1 {v2.16b, v3.16b}, [%1], #32 \n"  // dst += 32
       "b.gt 1b \n"
       : "+r"(src_argb),  // %0
         "+r"(dst_argb),  // %1
         "+r"(width)      // %2
-      : "r"((ptrdiff_t)-64),      // %3
-        "r"(&kShuffleMirror)      // %4
+      : "r"(&kShuffleMirrorARGB)  // %3
       : "cc", "memory", "v0", "v1", "v2", "v3", "v4");
 }
@@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
       : "cc", "memory", "v0", "v1", "v2", "v3");
 }
 
+// Shuffle table for swapping UV bytes.
+static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
+                                     9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
+
 // Convert UV plane of NV12 to VU of NV21.
 void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
   asm volatile(
+      "ld1 {v2.16b}, [%3] \n"  // shuffler
       "1: \n"
-      "ld2 {v0.16b, v1.16b}, [%0], #32 \n"  // load 16 UV values
-      "orr v2.16b, v0.16b, v0.16b \n"  // move U after V
+      "ld1 {v0.16b}, [%0], 16 \n"  // load 16 UV values
+      "ld1 {v1.16b}, [%0], 16 \n"
       "subs %w2, %w2, #16 \n"  // 16 pixels per loop
-      "st2 {v1.16b, v2.16b}, [%1], #32 \n"  // store 16 VU pixels
+      "tbl v0.16b, {v0.16b}, v2.16b \n"
+      "tbl v1.16b, {v1.16b}, v2.16b \n"
       "prfm pldl1keep, [%0, 448] \n"  // prefetch 7 lines ahead
+      "stp q0, q1, [%1], 32 \n"  // store 16 VU pixels
       "b.gt 1b \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_vu),  // %1
         "+r"(width)    // %2
-      :
+      : "r"(&kShuffleSwapUV)  // %3
       : "cc", "memory", "v0", "v1", "v2");
 }

View File

@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
                     SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
 
 TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
+TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
 
 #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y,         \
                          FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \

View File

@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
   }
 }
 
-TEST_F(LibYUVPlanarTest, TestARGBMirror) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
-
-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i][0] = i;
-    orig_pixels[i][1] = i / 2;
-    orig_pixels[i][2] = i / 3;
-    orig_pixels[i][3] = i / 4;
-  }
-  ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
-
-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
-    EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
-    EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
-    EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
-  }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
-  }
+TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 4);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
+  MaskCpuFlags(disable_cpu_flags_);
+  ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
+             benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
+               benchmark_width_ * 4, benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }
 
-TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
-  SIMD_ALIGNED(uint8_t orig_pixels[1280]);
-  SIMD_ALIGNED(uint8_t dst_pixels[1280]);
-
-  for (int i = 0; i < 1280; ++i) {
-    orig_pixels[i] = i;
-  }
-  MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
-
-  for (int i = 0; i < 1280; ++i) {
-    EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
-  }
-  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
-    MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
-  }
+TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
+              benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
+                benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
+}
+
+TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
+  align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_opt,
+                        benchmark_width_ * benchmark_height_ * 2);
+  align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
+
+  MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
+  MaskCpuFlags(disable_cpu_flags_);
+  MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
+                benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
+                  benchmark_width_ * 2, benchmark_width_, benchmark_height_);
+  }
+  for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
 }
 
 TEST_F(LibYUVPlanarTest, TestShade) {