Add NV12Mirror and MirrorUVPlane functions

Add an AVX2 version of HalfMergeUVRow

Skylake Xeon performance for 1280x720
NV12Mirror_Any (109 ms)
NV12Mirror_Unaligned (113 ms)
NV12Mirror_Invert (107 ms)
NV12Mirror_Opt (108 ms)
NV12Mirror_NullY (19 ms)

Slightly faster than the comparable I420Mirror:
I420Mirror_Any (113 ms)
I420Mirror_Unaligned (110 ms)
I420Mirror_Invert (109 ms)
I420Mirror_Opt (110 ms)

BUG=libyuv:840, libyuv:858

Change-Id: I686b1b778383bfa10ecd1655e986bdc99e76d132
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2176066
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2020-05-04 12:32:28 -07:00 committed by Commit Bot
parent d9681c53b3
commit 7a61759f78
17 changed files with 432 additions and 87 deletions


@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1749
Version: 1751
License: BSD
License File: LICENSE


@@ -166,3 +166,4 @@ The 12 in NV12 refers to 12 bits per pixel. NV12 has a half width and half
height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
Most NV12 functions allow the destination Y pointer to be NULL.
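For example, the NV12Mirror function added in this change skips the Y plane when dst_y is NULL and mirrors only the interleaved UV plane. A minimal sketch (assumes tightly packed planes and an even width; the helper name is illustrative):

#include "libyuv/planar_functions.h"

// Mirror only the chroma of a packed NV12 image. The Y plane is left untouched
// because dst_y is NULL, in which case dst_stride_y is ignored.
void MirrorNV12ChromaOnly(const uint8_t* src_y, const uint8_t* src_uv,
                          uint8_t* dst_uv, int width, int height) {
  NV12Mirror(src_y, width, src_uv, width,
             /*dst_y=*/NULL, 0, dst_uv, width, width, height);
}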


@@ -190,7 +190,7 @@ mips
make V=1 -f linux.mk
make V=1 -f linux.mk clean
make V=1 -f linux.mk CXX=clang++
make V=1 -f linux.mk CXX=clang++ CC=clang
## Building the library with cmake


@@ -314,6 +314,22 @@ int I400Mirror(const uint8_t* src_y,
int width,
int height);
// Alias
#define NV12ToNV12Mirror NV12Mirror
// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Alias
#define ARGBToARGBMirror ARGBMirror
@@ -347,6 +363,15 @@ void MirrorPlane(const uint8_t* src_y,
int width,
int height);
// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
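A minimal usage sketch of the new MirrorUVPlane entry point declared above (illustrative only; the helper name is not part of the library and a tightly packed interleaved UV plane is assumed):

#include "libyuv/planar_functions.h"

// Mirror a standalone interleaved UV plane, e.g. the chroma plane of NV12.
// width is in UV pixels (half the image width for 420 data); strides are bytes.
void MirrorChromaPlane(const uint8_t* src_uv, uint8_t* dst_uv,
                       int uv_width, int uv_height) {
  MirrorUVPlane(src_uv, uv_width * 2, dst_uv, uv_width * 2, uv_width, uv_height);
}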


@@ -274,16 +274,18 @@ extern "C" {
#define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2
#define HAS_HALFMERGEUVROW_SSSE3
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
#define HAS_I210TOAR30ROW_SSSE3
#define HAS_I210TOARGBROW_SSSE3
#define HAS_I422TOAR30ROW_SSSE3
#define HAS_MERGERGBROW_SSSE3
#define HAS_MIRRORUVROW_AVX2
#define HAS_MIRRORUVROW_SSSE3
#define HAS_RAWTORGBAROW_SSSE3
#define HAS_RGB24MIRRORROW_SSSE3
#define HAS_RGBATOYJROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3
#define HAS_SWAPUVROW_SSSE3
#endif
// The following are available for AVX2 gcc/clang x86 platforms:
@@ -299,6 +301,7 @@ extern "C" {
#define HAS_ARGBTORGB24ROW_AVX2
#define HAS_CONVERT16TO8ROW_AVX2
#define HAS_CONVERT8TO16ROW_AVX2
#define HAS_HALFMERGEUVROW_AVX2
#define HAS_I210TOAR30ROW_AVX2
#define HAS_I210TOARGBROW_AVX2
#define HAS_I422TOAR30ROW_AVX2
@@ -368,6 +371,7 @@ extern "C" {
#define HAS_J400TOARGBROW_NEON
#define HAS_MERGEUVROW_NEON
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORUVROW_NEON
#define HAS_MIRRORSPLITUVROW_NEON
#define HAS_NV12TOARGBROW_NEON
#define HAS_NV12TORGB24ROW_NEON
@@ -1574,6 +1578,13 @@ void MirrorRow_Any_SSE2(const uint8_t* src, uint8_t* dst, int width);
void MirrorRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width);
void MirrorUVRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorUVRow_Any_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr, int width);
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -1735,6 +1746,13 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
uint8_t* dst_uv,
int width);
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_uv,
int width);
void SplitRGBRow_C(const uint8_t* src_rgb,
uint8_t* dst_r,
uint8_t* dst_g,


@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1749
#define LIBYUV_VERSION 1751
#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -1049,6 +1049,56 @@ void MirrorPlane(const uint8_t* src_y,
}
}
// Mirror a plane of UV data.
LIBYUV_API
void MirrorUVPlane(const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int y;
void (*MirrorUVRow)(const uint8_t* src, uint8_t* dst, int width) =
MirrorUVRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_uv = src_uv + (height - 1) * src_stride_uv;
src_stride_uv = -src_stride_uv;
}
#if defined(HAS_MIRRORUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorUVRow = MirrorUVRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorUVRow = MirrorUVRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorUVRow = MirrorUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
MirrorUVRow = MirrorUVRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorUVRow = MirrorUVRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MirrorUVRow = MirrorUVRow_AVX2;
}
}
#endif
// MirrorUV plane
for (y = 0; y < height; ++y) {
MirrorUVRow(src_uv, dst_uv, width);
src_uv += src_stride_uv;
dst_uv += dst_stride_uv;
}
}
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,
@@ -1089,7 +1139,7 @@ int I420Mirror(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_u || !src_v || !dst_y || !dst_u || !dst_v || width <= 0 ||
if (!src_y || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@@ -1113,6 +1163,42 @@ int I420Mirror(const uint8_t* src_y,
return 0;
}
// NV12 mirror.
LIBYUV_API
int NV12Mirror(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_uv,
int src_stride_uv,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_uv,
int dst_stride_uv,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if (!src_y || !src_uv || !dst_uv || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_y = src_y + (height - 1) * src_stride_y;
src_uv = src_uv + (halfheight - 1) * src_stride_uv;
src_stride_y = -src_stride_y;
src_stride_uv = -src_stride_uv;
}
if (dst_y) {
MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
MirrorUVPlane(src_uv, src_stride_uv, dst_uv, dst_stride_uv, halfwidth,
halfheight);
return 0;
}
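As the sign handling above shows, a negative height flips the copy vertically; combined with the horizontal mirror that amounts to a 180-degree rotation. A usage sketch (packed planes and even width assumed; the helper name is illustrative):

// Rotate a packed NV12 frame by 180 degrees: NV12Mirror reverses each row, and
// the negative height makes it read the source rows bottom-to-top.
int RotateNV12By180(const uint8_t* src_y, const uint8_t* src_uv,
                    uint8_t* dst_y, uint8_t* dst_uv, int width, int height) {
  return NV12Mirror(src_y, width, src_uv, width, dst_y, width, dst_uv, width,
                    width, -height);
}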
// ARGB mirror.
LIBYUV_API
int ARGBMirror(const uint8_t* src_argb,
@@ -1136,7 +1222,7 @@ int ARGBMirror(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}
@@ -4136,7 +4222,11 @@ void HalfMergeUVPlane(const uint8_t* src_u,
HalfMergeUVRow = HalfMergeUVRow_SSSE3;
}
#endif
#if defined(HAS_HALFMERGEUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
HalfMergeUVRow = HalfMergeUVRow_AVX2;
}
#endif
for (y = 0; y < height - 1; y += 2) {
// Merge a row of U and V into a row of UV.
HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);


@@ -347,7 +347,7 @@ void RotateUV180(const uint8_t* src,
void (*MirrorSplitUVRow)(const uint8_t* src, uint8_t* dst_u, uint8_t* dst_v,
int width) = MirrorSplitUVRow_C;
#if defined(HAS_MIRRORSPLITUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
MirrorSplitUVRow = MirrorSplitUVRow_NEON;
}
#endif


@@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb,
#if defined(HAS_ARGBMIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBMirrorRow = ARGBMirrorRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
if (IS_ALIGNED(width, 8)) {
ARGBMirrorRow = ARGBMirrorRow_NEON;
}
}


@@ -1182,6 +1182,15 @@ ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63)
#ifdef HAS_MIRRORROW_MMI
ANY11M(MirrorRow_Any_MMI, MirrorRow_MMI, 1, 7)
#endif
#ifdef HAS_MIRRORUVROW_AVX2
ANY11M(MirrorUVRow_Any_AVX2, MirrorUVRow_AVX2, 2, 15)
#endif
#ifdef HAS_MIRRORUVROW_SSSE3
ANY11M(MirrorUVRow_Any_SSSE3, MirrorUVRow_SSSE3, 2, 7)
#endif
#ifdef HAS_MIRRORUVROW_NEON
ANY11M(MirrorUVRow_Any_NEON, MirrorUVRow_NEON, 2, 31)
#endif
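In ANY11M the third argument is bytes per pixel (2 for interleaved UV) and the fourth is the width mask the SIMD kernel requires. A conceptual sketch of how such an Any wrapper splits a mirrored row between the SIMD kernel and the C fallback (illustrative only, not the actual ANY11M macro):

#include "libyuv/row.h"

// Conceptual Any wrapper for the SSSE3 UV mirror (mask 7, 2 bytes per pixel):
// the SIMD kernel handles a multiple of 8 pixels, the C row does the rest.
void MirrorUVRow_Any_Sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  int remainder = width & 7;           // pixels the SIMD kernel cannot cover
  int simd_width = width - remainder;  // multiple of 8
  if (simd_width > 0) {
    // Mirroring reverses order, so the SIMD kernel consumes the *last*
    // simd_width source pixels and produces the *first* destination pixels.
    MirrorUVRow_SSSE3(src_uv + remainder * 2, dst_uv, simd_width);
  }
  // The leading source pixels become the trailing destination pixels.
  MirrorUVRow_C(src_uv, dst_uv + simd_width * 2, remainder);
}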
#ifdef HAS_ARGBMIRRORROW_AVX2
ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
#endif
@@ -1189,7 +1198,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7)
ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3)
#endif
#ifdef HAS_ARGBMIRRORROW_NEON
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15)
ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 7)
#endif
#ifdef HAS_ARGBMIRRORROW_MSA
ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15)


@@ -2162,6 +2162,17 @@ void MirrorRow_C(const uint8_t* src, uint8_t* dst, int width) {
}
}
void MirrorUVRow_C(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
int x;
src_uv += (width - 1) << 1;
for (x = 0; x < width; ++x) {
dst_uv[0] = src_uv[0];
dst_uv[1] = src_uv[1];
src_uv -= 2;
dst_uv += 2;
}
}
void MirrorSplitUVRow_C(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,


@@ -3229,9 +3229,61 @@ void MirrorRow_AVX2(const uint8_t* src, uint8_t* dst, int width) {
}
#endif // HAS_MIRRORROW_AVX2
#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_SSSE3(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"movdqa %3,%%xmm5 \n"
LABELALIGN
"1: \n"
"movdqu -0x10(%0,%2,2),%%xmm0 \n"
"pshufb %%xmm5,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x10(%1),%1 \n"
"sub $0x8,%2 \n"
"jg 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirrorUV) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORUVROW_SSSE3
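The same per-block operation expressed with SSSE3 intrinsics, as a reference sketch (not the shipped kernel; assumes width is a multiple of 8, as the non-Any path requires):

#include <stdint.h>
#include <tmmintrin.h>  // SSSE3

// Reverse the order of UV pairs in a row, 8 pairs (16 bytes) per iteration,
// walking the source from its end toward its start.
static const uint8_t kShuffleMirrorUVSketch[16] = {14, 15, 12, 13, 10, 11, 8, 9,
                                                   6,  7,  4,  5,  2,  3,  0, 1};

void MirrorUVRow_SSSE3_Sketch(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
  const __m128i shuf = _mm_loadu_si128((const __m128i*)kShuffleMirrorUVSketch);
  const uint8_t* src = src_uv + width * 2 - 16;  // last 8 UV pairs of the row
  for (int x = 0; x < width; x += 8) {
    __m128i block = _mm_loadu_si128((const __m128i*)src);
    block = _mm_shuffle_epi8(block, shuf);  // reverse pair order within the block
    _mm_storeu_si128((__m128i*)dst_uv, block);
    src -= 16;
    dst_uv += 16;
  }
}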
#ifdef HAS_MIRRORUVROW_AVX2
void MirrorUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
intptr_t temp_width = (intptr_t)(width);
asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
LABELALIGN
"1: \n"
"vmovdqu -0x20(%0,%2,2),%%ymm0 \n"
"vpshufb %%ymm5,%%ymm0,%%ymm0 \n"
"vpermq $0x4e,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x20(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(temp_width) // %2
: "m"(kShuffleMirrorUV) // %3
: "memory", "cc", "xmm0", "xmm5");
}
#endif // HAS_MIRRORUVROW_AVX2
#ifdef HAS_MIRRORSPLITUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
static const uvec8 kShuffleMirrorSplitUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
void MirrorSplitUVRow_SSSE3(const uint8_t* src,
uint8_t* dst_u,
@@ -3257,7 +3309,7 @@ void MirrorSplitUVRow_SSSE3(const uint8_t* src,
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "m"(kShuffleMirrorSplitUV) // %4
: "memory", "cc", "xmm0", "xmm1");
}
#endif // HAS_MIRRORSPLITUVROW_SSSE3
@@ -7052,6 +7104,54 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
void HalfMergeUVRow_AVX2(const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_uv,
int width) {
asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
"1: \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n" // load 32 U values
"vmovdqu (%1),%%ymm1 \n" // load 32 V values
"vmovdqu 0(%0,%4,1),%%ymm2 \n" // 32 from next row
"vmovdqu 0(%1,%5,1),%%ymm3 \n"
"lea 0x20(%0),%0 \n"
"vpmaddubsw %%ymm4,%%ymm0,%%ymm0 \n" // half size
"vpmaddubsw %%ymm4,%%ymm1,%%ymm1 \n"
"vpmaddubsw %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm4,%%ymm3,%%ymm3 \n"
"lea 0x20(%1),%1 \n"
"vpaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vpsrlw $0x1,%%ymm0,%%ymm0 \n"
"vpsrlw $0x1,%%ymm1,%%ymm1 \n"
"vpavgw %%ymm5,%%ymm0,%%ymm0 \n"
"vpavgw %%ymm5,%%ymm1,%%ymm1 \n"
"vpackuswb %%ymm0,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpcklbw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%2) \n" // store 16 UV pixels
"lea 0x20(%2),%2 \n"
"sub $0x20,%3 \n" // 32 src pixels per loop
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(width) // %3
: "r"((intptr_t)(src_stride_u)), // %4
"r"((intptr_t)(src_stride_v)) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
}
#endif // defined(__x86_64__) || defined(__i386__)
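What the AVX2 kernel above computes, as a hedged scalar sketch (not the library's HalfMergeUVRow_C): each output UV byte is the rounded average of a 2x2 block, which is what the vpmaddubsw/vpaddw/vpsrlw/vpavgw sequence produces. An even width is assumed:

#include <stdint.h>

// Halve a row pair of planar U and V (2x2 box filter with rounding) and
// interleave the results as UV, one output pair per two input columns.
void HalfMergeUVRow_Sketch(const uint8_t* src_u, int src_stride_u,
                           const uint8_t* src_v, int src_stride_v,
                           uint8_t* dst_uv, int width) {
  for (int x = 0; x < width; x += 2) {
    dst_uv[0] = (src_u[x] + src_u[x + 1] + src_u[x + src_stride_u] +
                 src_u[x + src_stride_u + 1] + 2) >> 2;
    dst_uv[1] = (src_v[x] + src_v[x + 1] + src_v[x + src_stride_v] +
                 src_v[x + src_stride_v + 1] + 2) >> 2;
    dst_uv += 2;
  }
}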
#ifdef __cplusplus


@@ -701,6 +701,26 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
: "cc", "memory", "q0", "q1", "q2");
}
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
"mov r12, #-16 \n"
"add %0, %0, %2, lsl #1 \n"
"sub %0, #16 \n"
"1: \n"
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %2, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
"vst2.8 {d0, d1}, [%1]! \n" // dst += 16
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
:
: "cc", "memory", "r12", "q0");
}
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,


@@ -747,67 +747,99 @@ static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
asm volatile(
// Start at end of source row.
"ld1 {v3.16b}, [%4] \n" // shuffler
"ld1 {v3.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32
"ldr q2, [%0, 16] \n"
"ldr q1, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #32 \n" // 32 pixels per loop.
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"tbl v1.16b, {v1.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-32), // %3
"r"(&kShuffleMirror) // %4
: "r"(&kShuffleMirror) // %3
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Shuffle table for reversing the UV.
static const uvec8 kShuffleMirrorUV = {14u, 15u, 12u, 13u, 10u, 11u, 8u, 9u,
6u, 7u, 4u, 5u, 2u, 3u, 0u, 1u};
void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
asm volatile(
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #1 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_uv), // %1
"+r"(width) // %2
: "r"(&kShuffleMirrorUV) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile(
// Start at end of source row.
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w3, sxtw #1 \n"
"sub %0, %0, #16 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
"subs %w3, %w3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w3, %w3, #16 \n" // 16 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"uzp1 v0.16b, v2.16b, v3.16b \n" // U
"uzp2 v1.16b, v2.16b, v3.16b \n" // V
"st1 {v0.16b}, [%1], #16 \n" // dst += 16
"st1 {v1.16b}, [%2], #16 \n"
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1");
: "r"(&kShuffleMirrorUV) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
// Shuffle table for reversing the ARGB.
static const uvec8 kShuffleMirrorARGB = {12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u,
4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u};
void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
asm volatile(
"ld1 {v4.16b}, [%4] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n" // Start at end of row.
"sub %0, %0, #64 \n"
// Start at end of source row.
"ld1 {v4.16b}, [%3] \n" // shuffler
"add %0, %0, %w2, sxtw #2 \n"
"sub %0, %0, #32 \n"
"1: \n"
"ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64
"subs %w2, %w2, #16 \n" // 16 pixels per loop.
"tbl v0.16b, {v0.16b}, v4.16b \n"
"tbl v1.16b, {v1.16b}, v4.16b \n"
"tbl v2.16b, {v2.16b}, v4.16b \n"
"tbl v3.16b, {v3.16b}, v4.16b \n"
"st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64
"ldr q1, [%0, 16] \n"
"ldr q0, [%0], -32 \n" // src -= 32
"subs %w2, %w2, #8 \n" // 8 pixels per loop.
"tbl v2.16b, {v1.16b}, v4.16b \n"
"tbl v3.16b, {v0.16b}, v4.16b \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // dst += 32
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
"+r"(width) // %2
: "r"((ptrdiff_t)-64), // %3
"r"(&kShuffleMirror) // %4
: "r"(&kShuffleMirrorARGB) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
}
@@ -3249,20 +3281,27 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3");
}
// Shuffle table for swapping UV bytes.
static const uvec8 kShuffleSwapUV = {1u, 0u, 3u, 2u, 5u, 4u, 7u, 6u,
9u, 8u, 11u, 10u, 13u, 12u, 15u, 14u};
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
asm volatile(
"ld1 {v2.16b}, [%3] \n" // shuffler
"1: \n"
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values
"ld1 {v1.16b}, [%0], 16 \n"
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
"tbl v0.16b, {v0.16b}, v2.16b \n"
"tbl v1.16b, {v1.16b}, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"stp q0, q1, [%1], 32 \n" // store 16 VU pixels
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1
"+r"(width) // %2
:
: "r"(&kShuffleSwapUV) // %3
: "cc", "memory", "v0", "v1", "v2");
}


@@ -497,6 +497,7 @@ TESTPLANARTOBP(I400, 2, 2, NV21, 2, 2)
SUBSAMP_X, SUBSAMP_Y, benchmark_width_, _NullY, +, 0, 0)
TESTBIPLANARTOBP(NV21, 2, 2, NV12, 2, 2)
TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
#define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF, \


@@ -782,44 +782,75 @@ TEST_F(LibYUVPlanarTest, TestARGBQuantize) {
}
}
TEST_F(LibYUVPlanarTest, TestARGBMirror) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t dst_pixels[1280][4]);
TEST_F(LibYUVPlanarTest, ARGBMirror_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 4);
align_buffer_page_end(dst_pixels_opt,
benchmark_width_ * benchmark_height_ * 4);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 4);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i][0] = i;
orig_pixels[i][1] = i / 2;
orig_pixels[i][2] = i / 3;
orig_pixels[i][3] = i / 4;
}
ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 4);
MaskCpuFlags(disable_cpu_flags_);
ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_c,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i][0]);
EXPECT_EQ((i / 2) & 255, dst_pixels[1280 - 1 - i][1]);
EXPECT_EQ((i / 3) & 255, dst_pixels[1280 - 1 - i][2]);
EXPECT_EQ((i / 4) & 255, dst_pixels[1280 - 1 - i][3]);
for (int i = 0; i < benchmark_iterations_; ++i) {
ARGBMirror(src_pixels, benchmark_width_ * 4, dst_pixels_opt,
benchmark_width_ * 4, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
ARGBMirror(&orig_pixels[0][0], 0, &dst_pixels[0][0], 0, 1280, 1);
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 4; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
SIMD_ALIGNED(uint8_t orig_pixels[1280]);
SIMD_ALIGNED(uint8_t dst_pixels[1280]);
TEST_F(LibYUVPlanarTest, MirrorPlane_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_);
align_buffer_page_end(dst_pixels_opt, benchmark_width_ * benchmark_height_);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i] = i;
}
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_);
MaskCpuFlags(disable_cpu_flags_);
MirrorPlane(src_pixels, benchmark_width_, dst_pixels_c, benchmark_width_,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
for (int i = 0; i < benchmark_iterations_; ++i) {
MirrorPlane(src_pixels, benchmark_width_, dst_pixels_opt, benchmark_width_,
benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, MirrorUVPlane_Opt) {
align_buffer_page_end(src_pixels, benchmark_width_ * benchmark_height_ * 2);
align_buffer_page_end(dst_pixels_opt,
benchmark_width_ * benchmark_height_ * 2);
align_buffer_page_end(dst_pixels_c, benchmark_width_ * benchmark_height_ * 2);
MemRandomize(src_pixels, benchmark_width_ * benchmark_height_ * 2);
MaskCpuFlags(disable_cpu_flags_);
MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_c,
benchmark_width_ * 2, benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
MirrorUVPlane(src_pixels, benchmark_width_ * 2, dst_pixels_opt,
benchmark_width_ * 2, benchmark_width_, benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_ * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, TestShade) {