MergeUV_AVX512BW for I420ToNV12

MergeUVPlane_Opt, 640x360, 100000 iterations:

Skylake Xeon AVX512  1196 ms
Skylake Xeon AVX2    1565 ms
Skylake Xeon SSE2    1780 ms
Pixel 7 (NEON)       1177 ms

The AVX512BW kernel is roughly 1.3x faster than AVX2 and 1.5x faster than SSE2 on the same Skylake Xeon; the Pixel 7 figure is included for cross-architecture comparison.

Bug: None
Change-Id: If47d4fa957cf27781bba5fd6a2f0bf554101a5c6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4242247
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard, 2023-02-13 10:52:58 -08:00, committed by libyuv LUCI CQ
commit 2bdc210be9 (parent b2528b0be9)
17 changed files with 390 additions and 108 deletions
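
For context, the conversion this change accelerates reduces to one libyuv call; the UV interleave inside it is what the MergeUVPlane/MergeUVRow kernels implement. The harness below is an illustrative sketch (the buffer setup and the ConvertFrame name are mine, not part of the commit):

#include <cstdint>
#include <vector>
#include "libyuv/convert_from.h"  // I420ToNV12()

// Convert one I420 frame to NV12. dst_uv receives the interleaved UV
// plane produced by the MergeUV row kernels benchmarked above.
void ConvertFrame(int width, int height) {
  const int halfwidth = (width + 1) / 2;
  const int halfheight = (height + 1) / 2;
  std::vector<uint8_t> src_y(width * height), dst_y(width * height);
  std::vector<uint8_t> src_u(halfwidth * halfheight);
  std::vector<uint8_t> src_v(halfwidth * halfheight);
  std::vector<uint8_t> dst_uv(2 * halfwidth * halfheight);
  libyuv::I420ToNV12(src_y.data(), width, src_u.data(), halfwidth,
                     src_v.data(), halfwidth, dst_y.data(), width,
                     dst_uv.data(), 2 * halfwidth, width, height);
}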

View File

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1857
+Version: 1860
 License: BSD
 License File: LICENSE

View File

@@ -232,6 +232,27 @@ void TransposeWx1_16_C(const uint16_t* src,
                        uint16_t* dst,
                        int dst_stride,
                        int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
+
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width);
+
+// Transpose 32 bit values (ARGB)
+void Transpose8x8_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width);
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv

View File

@@ -402,9 +402,11 @@ extern "C" {
 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
 // TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(fbarchard): Port MERGEUV to assembly
 #if !defined(LIBYUV_DISABLE_X86) && \
-    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512))
+    (defined(__x86_64__) || defined(__i386__)) && (defined(CLANG_HAS_AVX512) && !defined(_MSC_VER))
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_MERGEUVROW_AVX512BW
 #endif
 
 // The following are available for AVX512 clang x64 platforms:
@@ -2184,6 +2186,10 @@ void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
                      int width);
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width);
 void MergeUVRow_NEON(const uint8_t* src_u,
                      const uint8_t* src_v,
                      uint8_t* dst_uv,
@@ -2204,6 +2210,10 @@ void MergeUVRow_Any_AVX2(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,
                          int width);
+void MergeUVRow_Any_AVX512BW(const uint8_t* y_buf,
+                             const uint8_t* uv_buf,
+                             uint8_t* dst_ptr,
+                             int width);
 void MergeUVRow_Any_NEON(const uint8_t* y_buf,
                          const uint8_t* uv_buf,
                          uint8_t* dst_ptr,

View File

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1857
+#define LIBYUV_VERSION 1860
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -924,6 +924,14 @@ int I422ToNV21(const uint8_t* src_y,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow = MergeUVRow_Any_NEON;

View File

@@ -389,6 +389,14 @@ int ARGBToNV12(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 32)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -559,6 +567,14 @@ int ARGBToNV21(const uint8_t* src_argb,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -726,6 +742,14 @@ int ABGRToNV12(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -894,6 +918,14 @@ int ABGRToNV21(const uint8_t* src_abgr,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;
@@ -2921,6 +2953,14 @@ int RAWToJNV21(const uint8_t* src_raw,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow_ = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(halfwidth, 64)) {
+      MergeUVRow_ = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow_ = MergeUVRow_Any_NEON;

View File

@@ -599,6 +599,14 @@ void MergeUVPlane(const uint8_t* src_u,
     }
   }
 #endif
+#if defined(HAS_MERGEUVROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    MergeUVRow = MergeUVRow_Any_AVX512BW;
+    if (IS_ALIGNED(width, 32)) {
+      MergeUVRow = MergeUVRow_AVX512BW;
+    }
+  }
+#endif
 #if defined(HAS_MERGEUVROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     MergeUVRow = MergeUVRow_Any_NEON;

View File

@@ -166,6 +166,63 @@ void TransposeWxH_16_C(const uint16_t* src,
   }
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_C(const uint8_t* src,
+                       int src_stride,
+                       uint8_t* dst,
+                       int dst_stride,
+                       int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  int i;
+  for (i = 0; i < width; i += 4) {
+    uint32_t p00 = ((uint32_t*)(src))[0];
+    uint32_t p10 = ((uint32_t*)(src))[1];
+    uint32_t p20 = ((uint32_t*)(src))[2];
+    uint32_t p30 = ((uint32_t*)(src))[3];
+    uint32_t p01 = ((uint32_t*)(src1))[0];
+    uint32_t p11 = ((uint32_t*)(src1))[1];
+    uint32_t p21 = ((uint32_t*)(src1))[2];
+    uint32_t p31 = ((uint32_t*)(src1))[3];
+    uint32_t p02 = ((uint32_t*)(src2))[0];
+    uint32_t p12 = ((uint32_t*)(src2))[1];
+    uint32_t p22 = ((uint32_t*)(src2))[2];
+    uint32_t p32 = ((uint32_t*)(src2))[3];
+    uint32_t p03 = ((uint32_t*)(src3))[0];
+    uint32_t p13 = ((uint32_t*)(src3))[1];
+    uint32_t p23 = ((uint32_t*)(src3))[2];
+    uint32_t p33 = ((uint32_t*)(src3))[3];
+    ((uint32_t*)(dst))[0] = p00;
+    ((uint32_t*)(dst))[1] = p01;
+    ((uint32_t*)(dst))[2] = p02;
+    ((uint32_t*)(dst))[3] = p03;
+    ((uint32_t*)(dst1))[0] = p10;
+    ((uint32_t*)(dst1))[1] = p11;
+    ((uint32_t*)(dst1))[2] = p12;
+    ((uint32_t*)(dst1))[3] = p13;
+    ((uint32_t*)(dst2))[0] = p20;
+    ((uint32_t*)(dst2))[1] = p21;
+    ((uint32_t*)(dst2))[2] = p22;
+    ((uint32_t*)(dst2))[3] = p23;
+    ((uint32_t*)(dst3))[0] = p30;
+    ((uint32_t*)(dst3))[1] = p31;
+    ((uint32_t*)(dst3))[2] = p32;
+    ((uint32_t*)(dst3))[3] = p33;
+    src += src_stride * 4;  // advance 4 rows
+    src1 += src_stride * 4;
+    src2 += src_stride * 4;
+    src3 += src_stride * 4;
+    dst += 4 * 4;  // advance 4 columns
+    dst1 += 4 * 4;
+    dst2 += 4 * 4;
+    dst3 += 4 * 4;
+  }
+}
 
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
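
A note on the loop structure above: each iteration transposes one 4x4 block of 32-bit (ARGB) pixels, reading the block column-wise and writing it row-wise; the source pointers then step down four rows while the destination pointers step right four columns. A minimal single-block usage sketch (mine, for illustration, not part of the commit):

#include <stdint.h>

uint32_t src[4][4];  // one 4x4 block of ARGB pixels
uint32_t dst[4][4];
// Strides are in bytes: 4 pixels * 4 bytes per row.
Transpose4x4_32_C((const uint8_t*)src, 4 * 4, (uint8_t*)dst, 4 * 4, 4);
// Afterwards dst[r][c] == src[c][r] for all r, c in 0..3.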

View File

@@ -435,6 +435,45 @@ void TransposeUVWx8_NEON(const uint8_t* src,
       : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
         "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v30", "v31");
 }
+
+// Transpose 32 bit values (ARGB)
+void Transpose4x4_32_NEON(const uint8_t* src,
+                          int src_stride,
+                          uint8_t* dst,
+                          int dst_stride,
+                          int width) {
+  const uint8_t* src1 = src + src_stride;
+  const uint8_t* src2 = src1 + src_stride;
+  const uint8_t* src3 = src2 + src_stride;
+  uint8_t* dst1 = dst + dst_stride;
+  uint8_t* dst2 = dst1 + dst_stride;
+  uint8_t* dst3 = dst2 + dst_stride;
+  asm volatile(
+      // Main loop transpose 4x4. Read a column, write a row.
+      "1:                                        \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[1], [%1], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[2], [%2], %9 \n"
+      "ld4         {v0.s, v1.s, v2.s, v3.s}[3], [%3], %9 \n"
+      "subs        %w8, %w8, #4                  \n"  // w -= 4
+      "st1         {v0.4s}, [%4], 16             \n"
+      "st1         {v1.4s}, [%5], 16             \n"
+      "st1         {v2.4s}, [%6], 16             \n"
+      "st1         {v3.4s}, [%7], 16             \n"
+      "b.gt        1b                            \n"
+      : "+r"(src),   // %0
+        "+r"(src1),  // %1
+        "+r"(src2),  // %2
+        "+r"(src3),  // %3
+        "+r"(dst),   // %4
+        "+r"(dst1),  // %5
+        "+r"(dst2),  // %6
+        "+r"(dst3),  // %7
+        "+r"(width)  // %8
+      : "r"((ptrdiff_t)(src_stride * 4))  // %9
+      : "memory", "cc", "v0", "v1", "v2", "v3");
+}
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus

View File

@@ -571,6 +571,9 @@ ANY21(MergeUVRow_Any_SSE2, MergeUVRow_SSE2, 0, 1, 1, 2, 15)
 #ifdef HAS_MERGEUVROW_AVX2
 ANY21(MergeUVRow_Any_AVX2, MergeUVRow_AVX2, 0, 1, 1, 2, 31)
 #endif
+#ifdef HAS_MERGEUVROW_AVX512BW
+ANY21(MergeUVRow_Any_AVX512BW, MergeUVRow_AVX512BW, 0, 1, 1, 2, 31)
+#endif
 #ifdef HAS_MERGEUVROW_NEON
 ANY21(MergeUVRow_Any_NEON, MergeUVRow_NEON, 0, 1, 1, 2, 15)
 #endif
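
The trailing MASK argument (31) matches the AVX512BW kernel's 32-pixels-per-iteration step: the Any wrapper runs the SIMD kernel over the largest multiple-of-32 prefix and mops up the ragged tail separately. A simplified model of what the generated wrapper does (the real ANY21 macro instead stages the tail through aligned scratch buffers and reruns the SIMD kernel on it):

void MergeUVRow_Any_AVX512BW_model(const uint8_t* src_u, const uint8_t* src_v,
                                   uint8_t* dst_uv, int width) {
  int n = width & ~31;  // multiple-of-32 body for the SIMD kernel
  if (n > 0) {
    MergeUVRow_AVX512BW(src_u, src_v, dst_uv, n);
  }
  // Scalar tail of 0..31 pixels; each UV pair is 2 output bytes.
  MergeUVRow_C(src_u + n, src_v + n, dst_uv + 2 * n, width & 31);
}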

View File

@@ -17,6 +17,8 @@ extern "C" {
 // This module is for GCC x86 and x64.
 #if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+#include <immintrin.h>
 
 #if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
 // Constants for ARGB
@@ -5142,6 +5144,25 @@ void DetileSplitUVRow_SSSE3(const uint8_t* src_uv,
 }
 #endif  // HAS_DETILESPLITUVROW_SSSE3
 
+#ifdef HAS_MERGEUVROW_AVX512BW
+__attribute__((target("avx512vl,avx512bw")))
+void MergeUVRow_AVX512BW(const uint8_t* src_u,
+                         const uint8_t* src_v,
+                         uint8_t* dst_uv,
+                         int width) {
+  do {
+    const __m512i u = _mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_u));
+    const __m512i v =
+        _mm512_slli_epi64(_mm512_cvtepu8_epi16(_mm256_loadu_epi8(src_v)), 8);
+    const __m512i uv = _mm512_or_si512(u, v);
+    _mm512_storeu_epi8(dst_uv, uv);
+    src_u += 32;
+    src_v += 32;
+    dst_uv += 64;
+    width -= 32;
+  } while (width > 0);
+}
+#endif  // HAS_MERGEUVROW_AVX512BW
+
 #ifdef HAS_MERGEUVROW_AVX2
 void MergeUVRow_AVX2(const uint8_t* src_u,
                      const uint8_t* src_v,
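
The kernel's widen/shift/or sequence is a vectorized byte interleave: zero-extending U fills the low byte of each 16-bit lane, shifting V left by 8 fills the high byte, and on a little-endian target the combined lane u | (v << 8) is stored as the byte pair U,V that NV12 expects. A scalar restatement (for exposition only, not part of the commit):

#include <stdint.h>
#include <string.h>

// Scalar model of MergeUVRow_AVX512BW; assumes a little-endian target.
void MergeUVRow_model(const uint8_t* src_u, const uint8_t* src_v,
                      uint8_t* dst_uv, int width) {
  for (int i = 0; i < width; ++i) {
    const uint16_t uv = (uint16_t)(src_u[i] | (src_v[i] << 8));
    memcpy(dst_uv + 2 * i, &uv, sizeof(uv));  // writes U then V
  }
}

Because the do/while loop rounds each row up to a whole number of 32-pixel steps, a width that is not a multiple of 32 would read and write past the end of the row; that is why the dispatch sites only select this raw kernel behind an IS_ALIGNED width check and otherwise use the Any wrapper.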

View File

@@ -2047,10 +2047,12 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
       "xvld $xr4, %0, 0 \n\t"
       "xvld $xr5, %0, 32 \n\t"
       "xvld $xr6, %0, 64 \n\t"
-      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of ARGB
+      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of
+                                // ARGB
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpickev.b $xr8, $xr5, $xr4 \n\t"  // BR
       "xvpickev.b $xr10, $xr7, $xr6 \n\t"
       "xvpickod.b $xr9, $xr5, $xr4 \n\t"  // GA
@@ -2070,10 +2072,8 @@
       : "+&r"(src_argb),  // %0
         "+&r"(dst_y),     // %1
         "+&r"(width)      // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-  );
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }
 
 void ARGBToYRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -2109,10 +2109,12 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
       "xvld $xr4, %0, 0 \n\t"
       "xvld $xr5, %0, 32 \n\t"
       "xvld $xr6, %0, 64 \n\t"
-      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of RGBA
+      "xvld $xr7, %0, 96 \n\t"  // load 32 pixels of
+                                // RGBA
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpickev.b $xr8, $xr5, $xr4 \n\t"  // AG
       "xvpickev.b $xr10, $xr7, $xr6 \n\t"
       "xvpickod.b $xr9, $xr5, $xr4 \n\t"  // BR
@@ -2132,10 +2134,8 @@
       : "+&r"(src_rgba),  // %0
         "+&r"(dst_y),     // %1
         "+&r"(width)      // %2
-      : "r"(rgbconstants),
-        "r"(shuff)
-      : "memory"
-  );
+      : "r"(rgbconstants), "r"(shuff)
+      : "memory");
 }
 
 void RGBAToYRow_LASX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -2154,7 +2154,8 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
                                  uint8_t* dst_y,
                                  int width,
                                  const struct RgbConstants* rgbconstants) {
-  int8_t shuff[128] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
+  int8_t shuff[128] = {
+      0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
       0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
       24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
       24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
@@ -2174,11 +2175,13 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
       "1: \n\t"
       "xvld $xr8, %0, 0 \n\t"
       "xvld $xr9, %0, 32 \n\t"
-      "xvld $xr10, %0, 64 \n\t"  // load 32 pixels of RGB
+      "xvld $xr10, %0, 64 \n\t"  // load 32 pixels of
+                                 // RGB
       "xvor.v $xr12, $xr3, $xr3 \n\t"
       "xvor.v $xr13, $xr3, $xr3 \n\t"
       "xvor.v $xr11, $xr9, $xr9 \n\t"
-      "addi.d %2, %2, -32 \n\t"  // 32 processed per loop.
+      "addi.d %2, %2, -32 \n\t"  // 32 processed per
+                                 // loop.
       "xvpermi.q $xr9, $xr8, 0x30 \n\t"   // src0
       "xvpermi.q $xr8, $xr10, 0x03 \n\t"  // src1
       "xvpermi.q $xr10, $xr11, 0x30 \n\t" // src2
@@ -2202,8 +2205,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
         "+&r"(width)        // %2
       : "r"(rgbconstants),  // %3
         "r"(shuff)          // %4
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGB24ToYJRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {

View File

@@ -1688,10 +1688,12 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
       "vld $vr4, %0, 0 \n\t"
       "vld $vr5, %0, 16 \n\t"
       "vld $vr6, %0, 32 \n\t"
-      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of ARGB
+      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of
+                               // ARGB
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vpickev.b $vr8, $vr5, $vr4 \n\t"  // BR
       "vpickev.b $vr10, $vr7, $vr6 \n\t"
       "vpickod.b $vr9, $vr5, $vr4 \n\t"  // GA
@@ -1711,8 +1713,7 @@
         "+&r"(dst_y),  // %1
         "+&r"(width)   // %2
       : "r"(rgbconstants)
-      : "memory"
-  );
+      : "memory");
 }
 
 void ARGBToYRow_LSX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@@ -1746,10 +1747,12 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
       "vld $vr4, %0, 0 \n\t"
       "vld $vr5, %0, 16 \n\t"
       "vld $vr6, %0, 32 \n\t"
-      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of RGBA
+      "vld $vr7, %0, 48 \n\t"  // load 16 pixels of
+                               // RGBA
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vpickev.b $vr8, $vr5, $vr4 \n\t"  // AG
       "vpickev.b $vr10, $vr7, $vr6 \n\t"
       "vpickod.b $vr9, $vr5, $vr4 \n\t"  // BR
@@ -1769,8 +1772,7 @@
         "+&r"(dst_y),  // %1
         "+&r"(width)   // %2
       : "r"(rgbconstants)
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@@ -1789,10 +1791,11 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
                                 uint8_t* dst_y,
                                 int width,
                                 const struct RgbConstants* rgbconstants) {
-  int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18, 20, 21, 23,
-                      24, 26, 27, 29, 30, 0, 1, 3, 4, 6, 7, 9, 10, 12, 13, 15,
-                      1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
-                      25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
+  int8_t shuff[64] = {0, 2, 3, 5, 6, 8, 9, 11, 12, 14, 15, 17, 18,
+                      20, 21, 23, 24, 26, 27, 29, 30, 0, 1, 3, 4, 6,
+                      7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
+                      0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
+                      31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
   asm volatile(
       "vldrepl.b $vr0, %3, 0 \n\t"  // load rgbconstants
       "vldrepl.b $vr1, %3, 1 \n\t"  // load rgbconstants
@@ -1805,10 +1808,12 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
       "1: \n\t"
       "vld $vr8, %0, 0 \n\t"
       "vld $vr9, %0, 16 \n\t"
-      "vld $vr10, %0, 32 \n\t"  // load 16 pixels of RGB
+      "vld $vr10, %0, 32 \n\t"  // load 16 pixels of
+                                // RGB
       "vor.v $vr12, $vr3, $vr3 \n\t"
       "vor.v $vr13, $vr3, $vr3 \n\t"
-      "addi.d %2, %2, -16 \n\t"  // 16 processed per loop.
+      "addi.d %2, %2, -16 \n\t"  // 16 processed per
+                                 // loop.
       "vshuf.b $vr14, $vr9, $vr8, $vr4 \n\t"
       "vshuf.b $vr15, $vr9, $vr10, $vr5 \n\t"
       "vshuf.b $vr16, $vr9, $vr8, $vr6 \n\t"
@@ -1829,8 +1834,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
         "+&r"(width)        // %2
       : "r"(rgbconstants),  // %3
         "r"(shuff)          // %4
-      : "memory"
-  );
+      : "memory");
 }
 
 void RGB24ToYJRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {

View File

@@ -820,6 +820,28 @@ void MergeUVRow_NEON(const uint8_t* src_u,
       : "cc", "memory", "v0", "v1"  // Clobber List
   );
 }
+
+// Reads 32 U's and V's and writes out 32 pairs of UV.
+void MergeUVRow_NEON1(const uint8_t* src_u,
+                      const uint8_t* src_v,
+                      uint8_t* dst_uv,
+                      int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1         {v0.16b}, [%0], #16           \n"  // load first 16 U
+      "ld1         {v2.16b}, [%0], #16           \n"  // load next 16 U
+      "ld1         {v1.16b}, [%1], #16           \n"  // load first 16 V
+      "ld1         {v3.16b}, [%1], #16           \n"  // load next 16 V
+      "subs        %w3, %w3, #32                 \n"  // 32 processed per loop
+      "prfm        pldl1keep, [%0, 448]          \n"
+      "prfm        pldl1keep, [%1, 448]          \n"
+      "st2         {v0.16b,v1.16b}, [%2], #32    \n"  // store first 16 UV pairs
+      "st2         {v2.16b,v3.16b}, [%2], #32    \n"  // store next 16 UV pairs
+      "b.gt        1b                            \n"
+      : "+r"(src_u),   // %0
+        "+r"(src_v),   // %1
+        "+r"(dst_uv),  // %2
+        "+r"(width)    // %3  // Output registers
+      :                // Input registers
+      : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
+  );
+}
 
 void MergeUVRow_16_NEON(const uint16_t* src_u,
                         const uint16_t* src_v,

View File

@@ -14,6 +14,10 @@
 #include "libyuv/cpu_id.h"
 #include "libyuv/rotate.h"
+
+#ifdef ENABLE_ROW_TESTS
+#include "libyuv/rotate_row.h"
+#endif
 
 namespace libyuv {
 
 #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
@@ -858,4 +862,47 @@ TEST_F(LibYUVRotateTest, I410Rotate270_Opt) {
                         disable_cpu_flags_, benchmark_cpu_info_);
 }
 
+#if defined(ENABLE_ROW_TESTS)
+TEST_F(LibYUVRotateTest, Transpose4x4) {
+  // dst width and height
+  const int width = ((benchmark_width_ * benchmark_height_ + 3) / 4 + 3) & ~3;
+  const int height = 4;
+  align_buffer_page_end(src_pixels, height * width * 4);
+  align_buffer_page_end(dst_pixels_c, width * height * 4);
+  align_buffer_page_end(dst_pixels_opt, width * height * 4);
+
+  MemRandomize(src_pixels, height * width * 4);
+  memset(dst_pixels_c, 1, width * height * 4);
+  memset(dst_pixels_opt, 1, width * height * 4);
+
+  Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                    (uint8_t*)dst_pixels_c, width * 4, width);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+#if defined(__aarch64__)
+    if (TestCpuFlag(kCpuHasNEON)) {
+      Transpose4x4_32_NEON((const uint8_t*)src_pixels, height * 4,
+                           (uint8_t*)dst_pixels_opt, width * 4, width);
+    } else {
+      Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                        (uint8_t*)dst_pixels_opt, width * 4, width);
+    }
+#else
+    Transpose4x4_32_C((const uint8_t*)src_pixels, height * 4,
+                      (uint8_t*)dst_pixels_opt, width * 4, width);
+#endif
+  }
+
+  // for (int i = 0; i < width * height; ++i) {
+  //   EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  // }
+
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_c);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+}
+#endif  // ENABLE_ROW_TESTS
+
 }  // namespace libyuv
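
Assuming a Chromium-style GN build of libyuv with ENABLE_ROW_TESTS defined (the new test is compiled conditionally), the test can be run in isolation with something like:

ninja -C out/Release libyuv_unittest
out/Release/libyuv_unittest --gtest_filter=LibYUVRotateTest.Transpose4x4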