HalfMergeUVPlane function and optimized I444ToNV12 and I444ToNV21

Bug: libyuv:858 Change-Id: Ie1f03a9acaff02ee8059cf1e5c2c2e5afcde8592 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2154608 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: richard winterton <rrwinterton@gmail.com>
2026-01-01 03:12:16 +08:00 · 2020-04-17 11:08:04 -07:00 · 2020-04-17 11:08:04 -07:00 · 2f48ffd42b
commit 2f48ffd42b
parent d4c3f45eb6
10 changed files with 336 additions and 84 deletions
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@ -105,6 +105,19 @@ void MergeUVPlane(const uint8_t* src_u,
                  int width,
                  int height);

+// Scale U and V to half width and height (2x2 average) and interleave as UV.
+// width and height are source size, odd allowed; negative height inverts.
+// Use for converting I444 to NV12 (I422 unverified: width is halved too).
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height);
+
 // Swap U and V channels in interleaved UV plane.
 LIBYUV_API
 void SwapUVPlane(const uint8_t* src_uv,
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -273,6 +273,7 @@ extern "C" {
 #define HAS_ARGBTOAR30ROW_SSSE3
 #define HAS_CONVERT16TO8ROW_SSSE3
 #define HAS_CONVERT8TO16ROW_SSE2
+#define HAS_HALFMERGEUVROW_SSSE3
 // I210 is for H010.  2 = 422.  I for 601 vs H for 709.
 #define HAS_I210TOAR30ROW_SSSE3
 #define HAS_I210TOARGBROW_SSSE3
@ -343,7 +344,6 @@ extern "C" {
 #define HAS_ARGBTOUVJROW_NEON
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
-#define HAS_RGBATOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
 #define HAS_AYUVTOUVROW_NEON
 #define HAS_AYUVTOVUROW_NEON
@ -353,6 +353,7 @@ extern "C" {
 #define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_HALFFLOATROW_NEON
+#define HAS_HALFMERGEUVROW_NEON
 #define HAS_I400TOARGBROW_NEON
 #define HAS_I422ALPHATOARGBROW_NEON
 #define HAS_I422TOARGB1555ROW_NEON
@ -375,19 +376,20 @@ extern "C" {
 #define HAS_NV21TORGB24ROW_NEON
 #define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
-#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
+#define HAS_RAWTORGBAROW_NEON
 #define HAS_RAWTOUVROW_NEON
-#define HAS_RAWTOYROW_NEON
 #define HAS_RAWTOYJROW_NEON
+#define HAS_RAWTOYROW_NEON
 #define HAS_RGB24TOARGBROW_NEON
 #define HAS_RGB24TOUVROW_NEON
-#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB24TOYJROW_NEON
+#define HAS_RGB24TOYROW_NEON
 #define HAS_RGB565TOARGBROW_NEON
 #define HAS_RGB565TOUVROW_NEON
 #define HAS_RGB565TOYROW_NEON
 #define HAS_RGBATOUVROW_NEON
+#define HAS_RGBATOYJROW_NEON
 #define HAS_RGBATOYROW_NEON
 #define HAS_SETROW_NEON
 #define HAS_SPLITRGBROW_NEON
@ -1712,6 +1714,27 @@ void MergeUVRow_Any_MMI(const uint8_t* y_buf,
                        uint8_t* dst_ptr,
                        int width);

+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,  // offset from src_u to the next U row
+                      const uint8_t* src_v,
+                      int src_stride_v,  // offset from src_v to the next V row
+                      uint8_t* dst_uv,   // interleaved U,V output
+                      int width);        // source width in pixels
+// NEON version; only selected when width is a multiple of 16.
+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width);
+// SSSE3 version; only selected when width is a multiple of 16.
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width);
+
 void SplitRGBRow_C(const uint8_t* src_rgb,
                   uint8_t* dst_r,
                   uint8_t* dst_g,
--- a/source/convert.cc
+++ b/source/convert.cc
@ -426,7 +426,41 @@ int I444ToI420(const uint8_t* src_y,
                    dst_v, dst_stride_v, width, height, width, height);
 }

-// TODO(fbarchard): Implement row conversion.
+// I444 to NV12: copies Y; U and V are 2x2 averaged and interleaved as UV.
+LIBYUV_API
+int I444ToNV12(const uint8_t* src_y,
+               int src_stride_y,
+               const uint8_t* src_u,
+               int src_stride_u,
+               const uint8_t* src_v,
+               int src_stride_v,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_uv,
+               int dst_stride_uv,
+               int width,
+               int height) {
+  if (width <= 0 || height == 0 || !src_y || !src_u || !src_v || !dst_y ||
+      !dst_uv) {
+    return -1;  // NULL plane pointer or empty image.
+  }
+  if (height < 0) {  // Negative height: invert by reading planes bottom-up.
+    height = -height;
+    src_y += (height - 1) * src_stride_y;
+    src_u += (height - 1) * src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+  if (dst_y) {
+    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+  }
+  HalfMergeUVPlane(src_u, src_stride_u, src_v, src_stride_v, dst_uv,
+                   dst_stride_uv, width, height);
+  return 0;
+}
+
 LIBYUV_API
 int I444ToNV21(const uint8_t* src_y,
               int src_stride_y,
@ -440,30 +474,9 @@ int I444ToNV21(const uint8_t* src_y,
               int dst_stride_vu,
               int width,
               int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_v, halfwidth, plane_u, halfwidth, dst_vu, dst_stride_vu,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
+  return I444ToNV12(src_y, src_stride_y, src_v, src_stride_v, src_u,
+                    src_stride_u, dst_y, dst_stride_y, dst_vu, dst_stride_vu,
+                    width, height);
 }

 // I400 is greyscale typically used in MJPG
@ -498,46 +511,6 @@ int I400ToI420(const uint8_t* src_y,
  return 0;
 }

-// TODO(fbarchard): Implement row conversion.
-LIBYUV_API
-int I444ToNV12(const uint8_t* src_y,
-               int src_stride_y,
-               const uint8_t* src_u,
-               int src_stride_u,
-               const uint8_t* src_v,
-               int src_stride_v,
-               uint8_t* dst_y,
-               int dst_stride_y,
-               uint8_t* dst_uv,
-               int dst_stride_uv,
-               int width,
-               int height) {
-  int halfwidth = (width + 1) >> 1;
-  int halfheight = (height + 1) >> 1;
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    halfheight = (height + 1) >> 1;
-    src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
-    src_stride_y = -src_stride_y;
-    src_stride_u = -src_stride_u;
-    src_stride_v = -src_stride_v;
-  }
-  // Allocate u and v buffers
-  align_buffer_64(plane_u, halfwidth * halfheight * 2);
-  uint8_t* plane_v = plane_u + halfwidth * halfheight;
-
-  I444ToI420(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
-             dst_y, dst_stride_y, plane_u, halfwidth, plane_v, halfwidth, width,
-             height);
-  MergeUVPlane(plane_u, halfwidth, plane_v, halfwidth, dst_uv, dst_stride_uv,
-               halfwidth, halfheight);
-  free_aligned_buffer_64(plane_u);
-  return 0;
-}
-
 // I400 is greyscale typically used in MJPG
 LIBYUV_API
 int I400ToNV21(const uint8_t* src_y,
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@ -488,7 +488,6 @@ int I420ToUYVY(const uint8_t* src_y,
  return 0;
 }

-// TODO(fbarchard): test negative height for invert.
 LIBYUV_API
 int I420ToNV12(const uint8_t* src_y,
               int src_stride_y,
@ -502,12 +501,23 @@ int I420ToNV12(const uint8_t* src_y,
               int dst_stride_uv,
               int width,
               int height) {
+  int halfwidth = (width + 1) / 2;
+  int halfheight = (height + 1) / 2;
  if (!src_y || !src_u || !src_v || !dst_y || !dst_uv || width <= 0 ||
      height == 0) {
    return -1;
  }
-  int halfwidth = (width + 1) / 2;
-  int halfheight = height > 0 ? (height + 1) / 2 : (height - 1) / 2;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    halfheight = (height + 1) >> 1;
+    src_y = src_y + (height - 1) * src_stride_y;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
+    src_stride_y = -src_stride_y;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
  if (dst_y) {
    CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
  }
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -4103,6 +4103,52 @@ int UYVYToNV12(const uint8_t* src_uyvy,
  return 0;
 }

+// Halve U and V in both directions (2x2 average) and interleave into dst_uv.
+LIBYUV_API
+void HalfMergeUVPlane(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int dst_stride_uv,
+                      int width,
+                      int height) {
+  int pairs;  // Remaining pairs of source rows.
+  void (*HalfMergeUVRow)(const uint8_t* src_u, int src_stride_u,
+                         const uint8_t* src_v, int src_stride_v,
+                         uint8_t* dst_uv, int width) = HalfMergeUVRow_C;
+  // A negative height inverts the image: start at the last row, walk upwards.
+  if (height < 0) {
+    height = -height;
+    src_u += (height - 1) * src_stride_u;
+    src_v += (height - 1) * src_stride_v;
+    src_stride_u = -src_stride_u;
+    src_stride_v = -src_stride_v;
+  }
+#if defined(HAS_HALFMERGEUVROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_NEON;
+  }
+#endif
+#if defined(HAS_HALFMERGEUVROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16)) {
+    HalfMergeUVRow = HalfMergeUVRow_SSSE3;
+  }
+#endif
+
+  for (pairs = height / 2; pairs > 0; --pairs) {
+    // Average two source rows of U and V into one interleaved UV row.
+    HalfMergeUVRow(src_u, src_stride_u, src_v, src_stride_v, dst_uv, width);
+    src_u += src_stride_u * 2;
+    src_v += src_stride_v * 2;
+    dst_uv += dst_stride_uv;
+  }
+  // Odd height: a stride of 0 pairs the final source row with itself.
+  if (height & 1) {
+    HalfMergeUVRow(src_u, 0, src_v, 0, dst_uv, width);
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -3563,6 +3563,30 @@ void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
  }
 }

+// Average each 2x2 block of U and of V, rounding to nearest, and store the
+// results interleaved as UV.  width is the source width and may be odd.
+void HalfMergeUVRow_C(const uint8_t* src_u,
+                      int src_stride_u,
+                      const uint8_t* src_v,
+                      int src_stride_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  const uint8_t* u_next = src_u + src_stride_u;
+  const uint8_t* v_next = src_v + src_stride_v;
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_uv[0] = (src_u[x] + src_u[x + 1] + u_next[x] + u_next[x + 1] + 2) >> 2;
+    dst_uv[1] = (src_v[x] + src_v[x + 1] + v_next[x] + v_next[x + 1] + 2) >> 2;
+    dst_uv += 2;
+  }
+  // Odd width: the last column has no right-hand neighbour, so average the
+  // two rows only.
+  if (width & 1) {
+    dst_uv[0] = (src_u[x] + u_next[x] + 1) >> 1;
+    dst_uv[1] = (src_v[x] + v_next[x] + 1) >> 1;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@ -1078,6 +1078,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
 }
 #endif

+// clang-format off
+
 // TODO(mraptis): Consider passing R, G, B multipliers as parameter.
 // round parameter is register containing value to add before shift.
 #define RGBTOY(round)                            \
@ -1102,10 +1104,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  "phaddw    %%xmm0,%%xmm6                   \n" \
  "phaddw    %%xmm2,%%xmm1                   \n" \
  "prefetcht0 1280(%0)                       \n" \
-  "paddw     %%" #round                          \
-  ",%%xmm6             \n"                       \
-  "paddw     %%" #round                          \
-  ",%%xmm1             \n"                       \
+  "paddw     %%" #round ",%%xmm6             \n" \
+  "paddw     %%" #round ",%%xmm1             \n" \
  "psrlw     $0x8,%%xmm6                     \n" \
  "psrlw     $0x8,%%xmm1                     \n" \
  "packuswb  %%xmm1,%%xmm6                   \n" \
@ -1132,10 +1132,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n" /* mutates. */  \
  "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"                 \
  "prefetcht0 1280(%0)                       \n"                 \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
-  "vpaddw     %%" #round                                         \
-  ",%%ymm2,%%ymm2     \n"                                        \
+  "vpaddw     %%" #round ",%%ymm0,%%ymm0     \n" /* Add .5 for rounding. */             \
+  "vpaddw     %%" #round ",%%ymm2,%%ymm2     \n" \
  "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"                 \
  "vpsrlw     $0x8,%%ymm2,%%ymm2             \n"                 \
  "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n" /* mutates. */  \
@ -1146,6 +1144,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
  "jg        1b                              \n"                 \
  "vzeroupper                                \n"

+// clang-format on
+
 #ifdef HAS_ARGBTOYROW_SSSE3
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values.
 void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -7005,6 +7005,53 @@ void SwapUVRow_AVX2(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
 }
 #endif  // HAS_SWAPUVROW_AVX2

+// 2x2 average 16 U and 16 V source pixels per pass into 8 interleaved UV pairs.
+void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
+                          int src_stride_u,
+                          const uint8_t* src_v,
+                          int src_stride_v,
+                          uint8_t* dst_uv,
+                          int width) {
+  asm volatile(
+      "pcmpeqb    %%xmm4,%%xmm4                  \n"  // xmm4 = all ones
+      "psrlw      $0xf,%%xmm4                    \n"  // 0x0001 in each word
+      "packuswb   %%xmm4,%%xmm4                  \n"  // 0x01 in each byte
+      "pxor       %%xmm5,%%xmm5                  \n"  // xmm5 = 0 for pavgw
+
+      LABELALIGN
+      "1:                                        \n"
+      "movdqu    (%0),%%xmm0                     \n"  // load 16 U values
+      "movdqu    (%1),%%xmm1                     \n"  // load 16 V values
+      "movdqu    0(%0,%4,1),%%xmm2               \n"  // 16 U from next row
+      "movdqu    0(%1,%5,1),%%xmm3               \n"  // 16 V from next row
+      "lea       0x10(%0),%0                     \n"
+      "pmaddubsw %%xmm4,%%xmm0                   \n"  // sum adjacent pairs
+      "pmaddubsw %%xmm4,%%xmm1                   \n"
+      "pmaddubsw %%xmm4,%%xmm2                   \n"
+      "pmaddubsw %%xmm4,%%xmm3                   \n"
+      "lea       0x10(%1),%1                     \n"
+      "paddw     %%xmm2,%%xmm0                   \n"  // 2x2 sums of U
+      "paddw     %%xmm3,%%xmm1                   \n"  // 2x2 sums of V
+      "psrlw     $0x1,%%xmm0                     \n"  // sum >> 1
+      "psrlw     $0x1,%%xmm1                     \n"
+      "pavgw     %%xmm5,%%xmm0                   \n"  // (x + 1) >> 1, rounds
+      "pavgw     %%xmm5,%%xmm1                   \n"
+      "packuswb  %%xmm0,%%xmm0                   \n"  // words to bytes
+      "packuswb  %%xmm1,%%xmm1                   \n"
+      "punpcklbw %%xmm1,%%xmm0                   \n"  // interleave U and V
+      "movdqu    %%xmm0,(%2)                     \n"  // store 8 UV pixels
+      "lea       0x10(%2),%2                     \n"
+      "sub       $0x10,%3                        \n"  // 16 src pixels per loop
+      "jg        1b                              \n"
+      : "+r"(src_u),                    // %0
+        "+r"(src_v),                    // %1
+        "+r"(dst_uv),                   // %2
+        "+r"(width)                     // %3
+      : "r"((intptr_t)(src_stride_u)),  // %4
+        "r"((intptr_t)(src_stride_v))   // %5
+      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
+}
+
 #endif  // defined(__x86_64__) || defined(__i386__)

 #ifdef __cplusplus
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -2984,6 +2984,39 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
      : "cc", "memory", "q0", "q1", "q2");
 }

+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;  // second U row of the 2x2
+  const uint8_t* src_v_1 = src_v + src_stride_v;  // second V row of the 2x2
+  asm volatile(
+      "1:                                        \n"  // 16 U + 16 V per pass
+      "vld1.8     {q0}, [%0]!                    \n"  // load 16 U values
+      "vld1.8     {q1}, [%2]!                    \n"  // load 16 V values
+      "vld1.8     {q2}, [%1]!                    \n"  // 16 U from next row
+      "vld1.8     {q3}, [%3]!                    \n"  // 16 V from next row
+      "vpaddl.u8  q0, q0                         \n"  // half size
+      "vpaddl.u8  q1, q1                         \n"  // 16 bytes -> 8 shorts
+      "vpadal.u8  q0, q2                         \n"  // add next row: 2x2 sum
+      "vpadal.u8  q1, q3                         \n"
+      "vqrshrn.u16 d0, q0, #2                    \n"  // 2x2 average of U
+      "vqrshrn.u16 d1, q1, #2                    \n"  // 2x2 average of V
+      "subs       %5, %5, #16                    \n"  // 16 src pixels per loop
+      "vst2.8     {d0, d1}, [%4]!                \n"  // store 8 UV pixels
+      "bgt        1b                             \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..

 #ifdef __cplusplus
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -3188,11 +3188,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
      "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "prfm       pldl1keep, [%0, 448]           \n"
      "uqrshrn    v3.8b, v0.8h, #2               \n"  // 2x2 average
      "uqrshrn    v2.8b, v1.8h, #2               \n"
+      "prfm       pldl1keep, [%1, 448]           \n"
      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
      "st2        {v2.8b,v3.8b}, [%2], #16       \n"  // store 8 pixels UV.
-      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
      "b.gt       1b                             \n"
      : "+r"(src_ayuv),    // %0
        "+r"(src_ayuv_1),  // %1
@ -3210,18 +3211,18 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
  asm volatile(

      "1:                                        \n"
-      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16
-                                                                // pixels.
+      "ld4        {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"  // load 16 ayuv
      "uaddlp     v0.8h, v0.16b                  \n"  // V 16 bytes -> 8 shorts.
      "uaddlp     v1.8h, v1.16b                  \n"  // U 16 bytes -> 8 shorts.
      "ld4        {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n"  // load next 16
      "uadalp     v0.8h, v4.16b                  \n"  // V 16 bytes -> 8 shorts.
      "uadalp     v1.8h, v5.16b                  \n"  // U 16 bytes -> 8 shorts.
+      "prfm       pldl1keep, [%0, 448]           \n"
      "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average
      "uqrshrn    v1.8b, v1.8h, #2               \n"
+      "prfm       pldl1keep, [%1, 448]           \n"
      "subs       %w3, %w3, #16                  \n"  // 16 processed per loop.
      "st2        {v0.8b,v1.8b}, [%2], #16       \n"  // store 8 pixels VU.
-      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
      "b.gt       1b                             \n"
      : "+r"(src_ayuv),    // %0
        "+r"(src_ayuv_1),  // %1
@ -3265,6 +3266,41 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
      : "cc", "memory", "v0", "v1", "v2");
 }

+void HalfMergeUVRow_NEON(const uint8_t* src_u,
+                         int src_stride_u,
+                         const uint8_t* src_v,
+                         int src_stride_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  const uint8_t* src_u_1 = src_u + src_stride_u;  // second U row of the 2x2
+  const uint8_t* src_v_1 = src_v + src_stride_v;  // second V row of the 2x2
+  asm volatile(
+      "1:                                        \n"  // 16 U + 16 V per pass
+      "ld1        {v0.16b}, [%0], #16            \n"  // load 16 U values
+      "ld1        {v1.16b}, [%2], #16            \n"  // load 16 V values
+      "ld1        {v2.16b}, [%1], #16            \n"  // 16 U from next row
+      "ld1        {v3.16b}, [%3], #16            \n"  // 16 V from next row
+      "uaddlp     v0.8h, v0.16b                  \n"  // half size
+      "uaddlp     v1.8h, v1.16b                  \n"  // 16 bytes -> 8 shorts
+      "prfm       pldl1keep, [%0, 448]           \n"  // prefetch 7 lines ahead
+      "uadalp     v0.8h, v2.16b                  \n"  // add next row: 2x2 sum
+      "uadalp     v1.8h, v3.16b                  \n"
+      "prfm       pldl1keep, [%2, 448]           \n"
+      "uqrshrn    v0.8b, v0.8h, #2               \n"  // 2x2 average of U
+      "uqrshrn    v1.8b, v1.8h, #2               \n"  // 2x2 average of V
+      "subs       %w5, %w5, #16                  \n"  // 16 src pixels per loop
+      "st2        {v0.8b, v1.8b}, [%4], #16      \n"  // store 8 UV pixels
+      "b.gt       1b                             \n"
+      : "+r"(src_u),    // %0
+        "+r"(src_u_1),  // %1
+        "+r"(src_v),    // %2
+        "+r"(src_v_1),  // %3
+        "+r"(dst_uv),   // %4
+        "+r"(width)     // %5
+      :
+      : "cc", "memory", "v0", "v1", "v2", "v3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)

 #ifdef __cplusplus
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -21,6 +21,7 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
+#include "libyuv/scale.h"

 #ifdef ENABLE_ROW_TESTS
 // row.h defines SIMD_ALIGNED, overriding unit_test.h
@ -3479,4 +3480,50 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
  free_aligned_buffer_page_end(orig_pixels);
 }

+TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
+  // Reference: halve U and V with bilinear ScalePlane, then MergeUVPlane.
+  const int dst_width = (benchmark_width_ + 1) / 2;
+  const int dst_height = (benchmark_height_ + 1) / 2;
+  const int src_size = benchmark_width_ * benchmark_height_;
+  const int half_size = dst_width * dst_height;
+  const int uv_size = dst_width * 2 * dst_height;
+  align_buffer_page_end(src_pixels_u, src_size);
+  align_buffer_page_end(src_pixels_v, src_size);
+  align_buffer_page_end(tmp_pixels_u, half_size);
+  align_buffer_page_end(tmp_pixels_v, half_size);
+  align_buffer_page_end(dst_pixels_uv_opt, uv_size);
+  align_buffer_page_end(dst_pixels_uv_c, uv_size);
+  MemRandomize(src_pixels_u, src_size);
+  MemRandomize(src_pixels_v, src_size);
+  MemRandomize(tmp_pixels_u, half_size);
+  MemRandomize(tmp_pixels_v, half_size);
+  MemRandomize(dst_pixels_uv_opt, uv_size);
+  MemRandomize(dst_pixels_uv_c, uv_size);
+
+  ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
+             benchmark_height_, tmp_pixels_u, dst_width, dst_width, dst_height,
+             kFilterBilinear);
+  ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
+             benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
+             kFilterBilinear);
+  MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
+               dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
+                     benchmark_width_, dst_pixels_uv_opt, dst_width * 2,
+                     benchmark_width_, benchmark_height_);
+  }
+
+  for (int i = 0; i < uv_size; ++i) {
+    EXPECT_EQ(dst_pixels_uv_c[i], dst_pixels_uv_opt[i]);
+  }
+
+  free_aligned_buffer_page_end(src_pixels_u);
+  free_aligned_buffer_page_end(src_pixels_v);
+  free_aligned_buffer_page_end(tmp_pixels_u);
+  free_aligned_buffer_page_end(tmp_pixels_v);
+  free_aligned_buffer_page_end(dst_pixels_uv_opt);
+  free_aligned_buffer_page_end(dst_pixels_uv_c);
+}
+
 }  // namespace libyuv