J420ToI420 using planar 8 bit scaling

- Add Convert8To8Plane, which scales and adds a bias to 8 bit values, allowing
  full range YUV to be converted to limited range YUV
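
For reference, the per-pixel mapping Convert8To8Plane applies can be sketched
in plain C (an illustrative sketch built from the constants in this change,
not code from the library; FullToLimited is a made-up name). The scale is 8.8
fixed point, consistent with 220 ~= 219/255 * 256 for Y and 225 ~= 224/255 * 256
for U/V, with a bias of 16:

  #include <stdint.h>

  // Full range (JPEG) value to limited range (BT.601) value. No clamp is
  // needed for these scale/bias pairs: results top out at 235 (Y) / 240 (U/V).
  static inline uint8_t FullToLimited(uint8_t v, int scale, int bias) {
    return (uint8_t)(((v * scale) >> 8) + bias);
  }
  // FullToLimited(255, 220, 16) == 235, FullToLimited(255, 225, 16) == 240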

libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Samsung S23
J420ToI420_Opt (45 ms)
I420ToI420_Opt (37 ms)

Skylake
J420ToI420_Opt (596 ms)
I420ToI420_Opt (99 ms)

Bug: 381327032
Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2025-01-21 15:56:56 -08:00
parent ef52c1658a
commit 26277baf96
15 changed files with 428 additions and 96 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
-Version: 1899
+Version: 1900
License: BSD
License File: LICENSE
Shipped: yes


@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
// Convert J420 to I420.
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to NV21.
LIBYUV_API
int I400ToNV21(const uint8_t* src_y,


@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int width,
int height);
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8_t* dst_y,


@ -507,6 +507,7 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON
@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y,
int scale,
int width);
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int bias,
int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1899
+#define LIBYUV_VERSION 1900
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y,
1, 1, 16);
}
static int Planar8bitTo8bit(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y,
int scale_y,
int bias_y,
int scale_uv,
int bias_uv) {
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
uv_height = -uv_height;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (uv_height - 1) * src_stride_u;
src_v = src_v + (uv_height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
if (dst_y) {
Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y,
width, height);
}
// Convert UV planes.
Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv,
uv_width, uv_height);
Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv,
uv_width, uv_height);
return 0;
}
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 220, 16, 225, 16);
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
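
A minimal usage sketch of the new entry point (an editorial illustration; the
wrapper name and the fixed 1280x720 size are not part of the change):

  #include <stdint.h>
  #include "libyuv/convert.h"

  // Convert a full range (JPEG/J420) 1280x720 frame to limited range I420.
  // Plane strides equal to the plane widths are assumed for simplicity.
  int FullRangeToLimitedRange(const uint8_t* src_y, const uint8_t* src_u,
                              const uint8_t* src_v, uint8_t* dst_y,
                              uint8_t* dst_u, uint8_t* dst_v) {
    const int width = 1280;
    const int height = 720;
    return libyuv::J420ToI420(src_y, width, src_u, width / 2, src_v, width / 2,
                              dst_y, width, dst_u, width / 2, dst_v, width / 2,
                              width, height);
  }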


@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
-  cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0;
+  cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
}
}
}


@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y,
}
}
// Convert a plane of 8 bit data to 8 bit
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height) {
int y;
void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale,
int bias, int width) = Convert8To8Row_C;
if (width <= 0 || height == 0) {
return;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_CONVERT8TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert8To8Row = Convert8To8Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_NEON;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert8To8Row = Convert8To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
Convert8To8Row = Convert8To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_AVX2;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert8To8Row = Convert8To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert8To8Row = Convert8To8Row_AVX512BW;
}
}
#endif
// Convert plane
for (y = 0; y < height; ++y) {
Convert8To8Row(src_y, dst_y, scale, bias, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
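
The "Coalesce rows" step above is worth a quick worked example (an editorial
note, not part of the change): when both strides equal the width, the plane is
contiguous, so the whole image is handed to the row function as one long row.

  // e.g. a contiguous 1280x720 plane with stride 1280:
  //   width becomes 1280 * 720 = 921600, height becomes 1,
  //   and Convert8To8Row runs exactly once over the plane.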
// Copy I422.
LIBYUV_API
int I422Copy(const uint8_t* src_y,


@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
#endif
#undef ANY11C
// Any 1 to 1 with scale and bias parameters. SBPP and BPP measure bytes per pixel.
#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \
int width) { \
SIMD_ALIGNED(STYPE vin[64]); \
SIMD_ALIGNED(DTYPE vout[64]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \
} \
memcpy(vin, src_ptr + n, r * SBPP); \
ANY_SIMD(vin, vout, scale, bias, MASK + 1); \
memcpy(dst_ptr + n, vout, r * BPP); \
}
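
How the Any wrapper handles widths that are not a multiple of the vector size,
worked through for the NEON instantiation below (MASK is 31, so the kernel
consumes 32 pixels per block); this is an editorial illustration, not code
from the change:

  // width = 70: n = 70 & ~31 = 64 pixels go straight to Convert8To8Row_NEON;
  //             r = 70 & 31  = 6 tail pixels are copied into the 64-byte temp
  //             buffers, processed as one full 32-pixel block, and the 6
  //             valid output bytes are copied back to dst_ptr + 64.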
#ifdef HAS_CONVERT8TO8ROW_NEON
ANY11SB(Convert8To8Row_Any_NEON,
Convert8To8Row_NEON,
1,
1,
uint8_t,
uint8_t,
31)
#endif
#undef ANY11SB
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \


@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y,
}
}
// Use scale to convert J420 to I420
// scale parameter is 8.8 fixed point but limited to 0 to 255
// Function is based on DivideRow, but adds a bias
// Does not clamp
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
int x;
assert(scale >= 0);
assert(scale <= 255);
for (x = 0; x < width; ++x) {
dst_y[x] = ((src_y[x] * scale) >> 8) + bias;
}
}
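
The "Does not clamp" note above is safe for the J420ToI420 parameters because
the worst-case results already fit in 8 bits (editorial arithmetic check):

  ((255 * 220) >> 8) + 16 = 219 + 16 = 235   // largest possible Y output
  ((255 * 225) >> 8) + 16 = 224 + 16 = 240   // largest possible U/V output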
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
memcpy(dst, src, count);
}


@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I410ToAR30Row_NEON(const uint16_t* src_y,
@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I212ToAR30Row_NEON(const uint16_t* src_y,
@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
-asm volatile(
-YUVTORGB_SETUP
+asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV212 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y,
@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV210 NVTORGB RGBTORGB8
+"1: \n" READYUV210 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV410 NVTORGB RGBTORGB8
+"1: \n" READYUV410 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
-"1: \n" READYUV212 NVTORGB RGBTORGB8
+"1: \n" READYUV212 NVTORGB
+RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
-asm volatile(YUVTORGB_SETUP
+asm volatile(
+YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void P410ToAR30Row_NEON(const uint16_t* src_y,
@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
-asm volatile(YUVTORGB_SETUP
+asm volatile(
+YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I422ToAR30Row_NEON(const uint8_t* src_y,
@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
-"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
+"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
+// RGB1555.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
"movi v7.16b, #16 \n" // Add 16 constant
"1: \n"
-"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
+"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555
+// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
RGB555TOARGB
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
"movi v27.16b, #16 \n" // Add 16 constant
"1: \n"
-"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
+"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444
+// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
ARGB4444TORGB
"umull v16.8h, v0.8b, v24.8b \n" // B
@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint32_t value) {
asm volatile(
"dup v0.4s, %w3 \n" // duplicate scale value.
-"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb.
+"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b
+// aarrggbbaarrggbb.
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"mul v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"dup v4.8h, %w3 \n"
"1: \n"
"ldp q2, q3, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"umull v0.4s, v2.4h, v4.4h \n"
"umull2 v1.4s, v2.8h, v4.8h \n"
"umull v2.4s, v3.4h, v4.4h \n"
@ -5306,7 +5313,6 @@
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
+"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"uqshl v0.8h, v0.8h, v2.8h \n"
"uqshl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
-"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"str q0, [%1], #16 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5346,6 +5352,40 @@
: "cc", "memory", "v0", "v1", "v2");
}
// Use scale to convert J420 to I420
// scale parameter is 8.8 fixed point but limited to 0 to 255
// Function is based on DivideRow, but adds a bias
// Does not clamp
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
asm volatile(
"dup v4.16b, %w3 \n" // scale
"dup v5.16b, %w4 \n" // bias
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 pixels per loop
"umull v0.8h, v2.8b, v4.8b \n"
"umull2 v1.8h, v2.16b, v4.16b \n"
"umull v2.8h, v3.8b, v4.8b \n"
"umull2 v3.8h, v3.16b, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"uzp2 v1.16b, v2.16b, v3.16b \n"
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
"add v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%1], #32 \n" // store 32 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale), // %3
"r"(bias) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
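
A scalar model of what the widening multiply plus uzp2 sequence computes
(an editorial sketch; the real kernel above handles 32 pixels per iteration):

  #include <stdint.h>

  static inline uint8_t Convert8To8_ScalarModel(uint8_t v, uint8_t scale,
                                                uint8_t bias) {
    uint16_t wide = (uint16_t)v * scale;  // umull/umull2: 8x8 -> 16 bit product
    uint8_t high = (uint8_t)(wide >> 8);  // uzp2 keeps the high byte per lane
    return (uint8_t)(high + bias);        // add v5.16b: plain add, no saturation
  }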
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
-if (!src_y || !src_u || !src_v || src_width <= 0 ||
-    src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
-    !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
+if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
+    src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
+    dst_width <= 0 || dst_height <= 0) {
return -1;
}


@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
-"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
+"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per
+// iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"


@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
}
#endif
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
0, 0, 128, 128, 255, 255};
const uint8_t src_u[3] = {0, 128, 255};
const uint8_t src_v[3] = {0, 128, 255};
uint8_t dst_y[12];
uint8_t dst_u[3];
uint8_t dst_v[3];
ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v,
3, 6, 2),
0);
EXPECT_EQ(dst_y[0], 16);
EXPECT_EQ(dst_y[2], 126);
EXPECT_EQ(dst_y[4], 235);
EXPECT_EQ(dst_u[0], 16);
EXPECT_EQ(dst_u[1], 128);
EXPECT_EQ(dst_u[2], 240);
EXPECT_EQ(dst_v[0], 16);
EXPECT_EQ(dst_v[1], 128);
EXPECT_EQ(dst_v[2], 240);
}
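
The expected values follow directly from the row math (worked out here for
reference):

  Y:   ((0 * 220) >> 8) + 16 = 16,  ((128 * 220) >> 8) + 16 = 126,  ((255 * 220) >> 8) + 16 = 235
  U/V: ((0 * 225) >> 8) + 16 = 16,  ((128 * 225) >> 8) + 16 = 128,  ((255 * 225) >> 8) + 16 = 240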
#endif // !defined(LEAN_TESTS)
} // namespace libyuv


@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, Convert8To8Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
MemRandomize(src_pixels_y, kPixels);
memset(dst_pixels_y_opt, 0, kPixels);
memset(dst_pixels_y_c, 1, kPixels);
MaskCpuFlags(disable_cpu_flags_);
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
}
free_aligned_buffer_page_end(src_pixels_y);
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, YUY2ToY) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels * 2);