J420ToI420 using planar 8 bit scaling

- Add Convert8To8Plane, which scales and biases 8 bit values, allowing full
  range YUV to be converted to limited range YUV
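
For reference, a minimal sketch of the per-pixel math this adds, assuming the
220/16 (Y) and 225/16 (U,V) scale/bias pairs that J420ToI420 passes; the
expected values in the comments come from the new TestJ420ToI420 unit test:

#include <stdint.h>
#include <stdio.h>

// Same operation as Convert8To8Row_C: scale is 8.8 fixed point (0..255),
// bias is added after the shift, and nothing is clamped.
static uint8_t scale_bias(uint8_t v, int scale, int bias) {
  return (uint8_t)(((v * scale) >> 8) + bias);
}

int main(void) {
  // Y: full range 0..255 maps into limited range 16..235.
  printf("Y: %d %d %d\n", scale_bias(0, 220, 16), scale_bias(128, 220, 16),
         scale_bias(255, 220, 16));  // 16 126 235
  // U,V: full range 0..255 maps into limited range 16..240.
  printf("UV: %d %d %d\n", scale_bias(0, 225, 16), scale_bias(128, 225, 16),
         scale_bias(255, 225, 16));  // 16 128 240
  return 0;
}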

libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1

Samsung S23
J420ToI420_Opt (45 ms)
I420ToI420_Opt (37 ms)

Skylake
J420ToI420_Opt (596 ms)
I420ToI420_Opt (99 ms)

Bug: 381327032
Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2025-01-21 15:56:56 -08:00
parent ef52c1658a
commit 26277baf96
15 changed files with 428 additions and 96 deletions


@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1899
Version: 1900
License: BSD
License File: LICENSE
Shipped: yes


@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y,
int width,
int height);
// Convert J420 to I420.
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// Convert I400 (grey) to NV21.
LIBYUV_API
int I400ToNV21(const uint8_t* src_y,


@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y,
int width,
int height);
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height);
// Set a plane of data to a 32 bit value.
LIBYUV_API
void SetPlane(uint8_t* dst_y,


@ -507,6 +507,7 @@ extern "C" {
// The following are available on AArch64 platforms:
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#define HAS_CONVERT8TO8ROW_NEON
#define HAS_ARGBTOAR30ROW_NEON
#define HAS_ABGRTOAR30ROW_NEON
#define HAS_I210ALPHATOARGBROW_NEON
@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y,
int scale,
int width);
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width);
void Convert8To8Row_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int scale,
int bias,
int width);
void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width);
void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width);


@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1899
#define LIBYUV_VERSION 1900
#endif // INCLUDE_LIBYUV_VERSION_H_


@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y,
1, 1, 16);
}
static int Planar8bitTo8bit(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height,
int subsample_x,
int subsample_y,
int scale_y,
int bias_y,
int scale_uv,
int bias_uv) {
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
uv_height = -uv_height;
src_y = src_y + (height - 1) * src_stride_y;
src_u = src_u + (uv_height - 1) * src_stride_u;
src_v = src_v + (uv_height - 1) * src_stride_v;
src_stride_y = -src_stride_y;
src_stride_u = -src_stride_u;
src_stride_v = -src_stride_v;
}
// Convert Y plane.
if (dst_y) {
Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y,
width, height);
}
// Convert UV planes.
Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv,
uv_width, uv_height);
Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv,
uv_width, uv_height);
return 0;
}
LIBYUV_API
int J420ToI420(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,
int src_stride_u,
const uint8_t* src_v,
int src_stride_v,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_u,
dst_stride_u, dst_v, dst_stride_v, width, height, 1,
1, 220, 16, 225, 16);
}
#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
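
A minimal caller sketch for the new entry point; the frame size, packed
strides, and the helper name convert_frame below are illustrative, not part
of this change:

#include <stdint.h>
#include "libyuv/convert.h"

// Hypothetical helper: convert a tightly packed full-range (JPEG) 1280x720
// J420 frame to limited-range I420. Strides equal plane widths when packed.
int convert_frame(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                  uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
  const int width = 1280;
  const int height = 720;
  return libyuv::J420ToI420(y, width, u, width / 2, v, width / 2,
                            dst_y, width, dst_u, width / 2, dst_v, width / 2,
                            width, height);
}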


@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) {
((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) |
((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0);
if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) {
cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0;
cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0;
}
}
}


@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y,
}
}
// Convert a plane of 8 bit data to 8 bit, applying a scale and bias.
LIBYUV_API
void Convert8To8Plane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int scale, // 220 for Y, 225 for U,V
int bias, // 16
int width,
int height) {
int y;
void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale,
int bias, int width) = Convert8To8Row_C;
if (width <= 0 || height == 0) {
return;
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
dst_y = dst_y + (height - 1) * dst_stride_y;
dst_stride_y = -dst_stride_y;
}
// Coalesce rows.
if (src_stride_y == width && dst_stride_y == width) {
width *= height;
height = 1;
src_stride_y = dst_stride_y = 0;
}
#if defined(HAS_CONVERT8TO8ROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
Convert8To8Row = Convert8To8Row_Any_NEON;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_NEON;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SME)
if (TestCpuFlag(kCpuHasSME)) {
Convert8To8Row = Convert8To8Row_SME;
}
#endif
#if defined(HAS_CONVERT8TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
Convert8To8Row = Convert8To8Row_Any_SSSE3;
if (IS_ALIGNED(width, 8)) {
Convert8To8Row = Convert8To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert8To8Row = Convert8To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert8To8Row = Convert8To8Row_AVX2;
}
}
#endif
#if defined(HAS_CONVERT8TO8ROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
Convert8To8Row = Convert8To8Row_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
Convert8To8Row = Convert8To8Row_AVX512BW;
}
}
#endif
// Convert plane
for (y = 0; y < height; ++y) {
Convert8To8Row(src_y, dst_y, scale, bias, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Copy I422.
LIBYUV_API
int I422Copy(const uint8_t* src_y,


@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15)
#endif
#undef ANY11C
// Any 1 to 1 with scale and bias parameters. SBPP and BPP measure in bytes.
#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \
void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \
int width) { \
SIMD_ALIGNED(STYPE vin[64]); \
SIMD_ALIGNED(DTYPE vout[64]); \
memset(vin, 0, sizeof(vin)); /* for msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \
} \
memcpy(vin, src_ptr + n, r * SBPP); \
ANY_SIMD(vin, vout, scale, bias, MASK + 1); \
memcpy(dst_ptr + n, vout, r * BPP); \
}
#ifdef HAS_CONVERT8TO8ROW_NEON
ANY11SB(Convert8To8Row_Any_NEON,
Convert8To8Row_NEON,
1,
1,
uint8_t,
uint8_t,
31)
#endif
#undef ANY11SB
// Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \


@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y,
}
}
// Use scale and bias to convert J420 to I420.
// The scale parameter is 8.8 fixed point but limited to 0 to 255.
// Based on DivideRow, but adds a bias.
// Does not clamp.
void Convert8To8Row_C(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
int x;
assert(scale >= 0);
assert(scale <= 255);
for (x = 0; x < width; ++x) {
dst_y[x] = ((src_y[x] * scale) >> 8) + bias;
}
}
void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) {
memcpy(dst, src, count);
}


@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I410ToAR30Row_NEON(const uint16_t* src_y,
@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y,
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
uint16_t alpha = 0xc000;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"dup v23.8h, %w[alpha] \n"
"1: \n" READYUV410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[alpha] "r"(alpha) // %[alpha]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I212ToAR30Row_NEON(const uint16_t* src_y,
@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
asm volatile(
YUVTORGB_SETUP
asm volatile(YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"1: \n" READYUV212 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit) // %[limit]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I210ToARGBRow_NEON(const uint16_t* src_y,
@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV210 NVTORGB RGBTORGB8
"1: \n" READYUV210 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV410 NVTORGB RGBTORGB8
"1: \n" READYUV410 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y,
asm volatile(
YUVTORGB_SETUP
"movi v19.8b, #255 \n"
"1: \n" READYUV212 NVTORGB RGBTORGB8
"1: \n" READYUV212 NVTORGB
RGBTORGB8
"subs %w[width], %w[width], #8 \n"
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
"b.gt 1b \n"
@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
const uint16_t limit = 0x3ff0;
asm volatile(YUVTORGB_SETUP
asm volatile(
YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP210 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void P410ToAR30Row_NEON(const uint16_t* src_y,
@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y,
const uvec8* uv_coeff = &yuvconstants->kUVCoeff;
const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias;
uint16_t limit = 0x3ff0;
asm volatile(YUVTORGB_SETUP
asm volatile(
YUVTORGB_SETUP
"dup v22.8h, %w[limit] \n"
"movi v23.8h, #0xc0, lsl #8 \n" // A
"ldr q2, [%[kIndices]] \n"
"1: \n" READYUVP410 NVTORGB
"subs %w[width], %w[width], #8 \n" STOREAR30
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
: [src_y] "+r"(src_y), // %[src_y]
[src_uv] "+r"(src_uv), // %[src_uv]
[dst_ar30] "+r"(dst_ar30), // %[dst_ar30]
[width] "+r"(width) // %[width]
: [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff]
[kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias]
[limit] "r"(limit), // %[limit]
[kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices]
: "cc", "memory", YUVTORGB_REGS, "v22", "v23");
}
void I422ToAR30Row_NEON(const uint8_t* src_y,
@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
READYUV422 I4XXTORGB RGBTORGB8_TOP
"subs %w[width], %w[width], #8 \n" //
ARGBTOARGB1555_FROM_TOP
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555.
"st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels
// RGB1555.
"b.gt 1b \n"
: [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u]
@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"movi v6.16b, #66 \n" // R * 0.2578 coefficient
"movi v7.16b, #16 \n" // Add 16 constant
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels.
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
RGB555TOARGB
"umull v16.8h, v0.8b, v4.8b \n" // B
@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"movi v26.16b, #66 \n" // R * 0.2578 coefficient
"movi v27.16b, #16 \n" // Add 16 constant
"1: \n"
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels.
"ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
ARGB4444TORGB
"umull v16.8h, v0.8b, v24.8b \n" // B
@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
uint32_t value) {
asm volatile(
"dup v0.4s, %w3 \n" // duplicate scale value.
"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb.
"zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b
// aarrggbbaarrggbb.
"ushr v0.8h, v0.8h, #1 \n" // scale / 2.
// 8 pixel loop.
@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"mul v0.8h, v0.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"mul v1.8h, v1.8h, v2.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"dup v4.8h, %w3 \n"
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"umull v0.4s, v2.4h, v4.4h \n"
"umull2 v1.4s, v2.8h, v4.8h \n"
"umull v2.4s, v3.4h, v4.4h \n"
@ -5306,7 +5313,6 @@ void DivideRow_16_NEON(const uint16_t* src_y,
"uzp2 v0.8h, v0.8h, v1.8h \n"
"uzp2 v1.8h, v2.8h, v3.8h \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"uqshl v0.8h, v0.8h, v2.8h \n"
"uqshl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
"str q0, [%1], #16 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
@ -5346,6 +5352,40 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
: "cc", "memory", "v0", "v1", "v2");
}
// Use scale and bias to convert J420 to I420.
// The scale parameter is 8.8 fixed point but limited to 0 to 255.
// Based on DivideRow, but adds a bias.
// Does not clamp.
void Convert8To8Row_NEON(const uint8_t* src_y,
uint8_t* dst_y,
int scale,
int bias,
int width) {
asm volatile(
"dup v4.16b, %w3 \n" // scale
"dup v5.16b, %w4 \n" // bias
"1: \n"
"ldp q2, q3, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 pixels per loop
"umull v0.8h, v2.8b, v4.8b \n"
"umull2 v1.8h, v2.16b, v4.16b \n"
"umull v2.8h, v3.8b, v4.8b \n"
"umull2 v3.8h, v3.16b, v4.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"uzp2 v0.16b, v0.16b, v1.16b \n"
"uzp2 v1.16b, v2.16b, v3.16b \n"
"add v0.16b, v0.16b, v5.16b \n" // add bias (16)
"add v1.16b, v1.16b, v5.16b \n"
"stp q0, q1, [%1], #32 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(scale), // %3
"r"(bias) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus


@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y,
int dst_halfheight = SUBSAMPLE(dst_height, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y,
enum FilterMode filtering) {
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}
@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y,
int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1);
int r;
if (!src_y || !src_u || !src_v || src_width <= 0 ||
src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y ||
!dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) {
if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 ||
src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v ||
dst_width <= 0 || dst_height <= 0) {
return -1;
}


@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr,
"uzp2 v1.8h, v2.8h, v3.8h \n"
"uzp2 v2.8h, v4.8h, v5.8h \n"
"uzp2 v3.8h, v6.8h, v7.8h \n"
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration.
"subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per
// iteration.
"stp q0, q1, [%[dst_ptr]] \n"
"stp q2, q3, [%[dst_ptr], #32] \n"
"add %[dst_ptr], %[dst_ptr], #64 \n"


@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10)
TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8)
TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8)
TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10)
TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12)
@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
}
#endif
TEST_F(LibYUVConvertTest, TestJ420ToI420) {
const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255,
0, 0, 128, 128, 255, 255};
const uint8_t src_u[3] = {0, 128, 255};
const uint8_t src_v[3] = {0, 128, 255};
uint8_t dst_y[12];
uint8_t dst_u[3];
uint8_t dst_v[3];
ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v,
3, 6, 2),
0);
EXPECT_EQ(dst_y[0], 16);
EXPECT_EQ(dst_y[2], 126);
EXPECT_EQ(dst_y[4], 235);
EXPECT_EQ(dst_u[0], 16);
EXPECT_EQ(dst_u[1], 128);
EXPECT_EQ(dst_u[2], 240);
EXPECT_EQ(dst_v[0], 16);
EXPECT_EQ(dst_v[1], 128);
EXPECT_EQ(dst_v[2], 240);
}
#endif // !defined(LEAN_TESTS)
} // namespace libyuv


@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) {
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, Convert8To8Plane) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels);
align_buffer_page_end(dst_pixels_y_opt, kPixels);
align_buffer_page_end(dst_pixels_y_c, kPixels);
MemRandomize(src_pixels_y, kPixels);
memset(dst_pixels_y_opt, 0, kPixels);
memset(dst_pixels_y_c, 1, kPixels);
MaskCpuFlags(disable_cpu_flags_);
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt,
benchmark_width_, 220, 16, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < kPixels; ++i) {
EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]);
}
free_aligned_buffer_page_end(src_pixels_y);
free_aligned_buffer_page_end(dst_pixels_y_opt);
free_aligned_buffer_page_end(dst_pixels_y_c);
}
TEST_F(LibYUVPlanarTest, YUY2ToY) {
const int kPixels = benchmark_width_ * benchmark_height_;
align_buffer_page_end(src_pixels_y, kPixels * 2);