From 26277baf96fd95bf6efa4abab82775bde9bc5ccb Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 21 Jan 2025 15:56:56 -0800 Subject: [PATCH] J420ToI420 using planar 8 bit scaling - Add Convert8To8Plane which scale and add 8 bit values allowing full range YUV to be converted to limited range YUV libyuv_test '--gunit_filter=*J420ToI420*' --gunit_also_run_disabled_tests --libyuv_width=1280 --libyuv_height=720 --libyuv_repeat=1000 --libyuv_flags=-1 --libyuv_cpu_info=-1 Samsung S23 J420ToI420_Opt (45 ms) I420ToI420_Opt (37 ms) Skylake J420ToI420_Opt (596 ms) I420ToI420_Opt (99 ms) Bug: 381327032 Change-Id: I380c3fa783491f2e3727af28b0ea9ce16d2bb8a4 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6182631 Reviewed-by: Wan-Teh Chang --- README.chromium | 2 +- include/libyuv/convert.h | 17 +++ include/libyuv/planar_functions.h | 10 ++ include/libyuv/row.h | 17 +++ include/libyuv/version.h | 2 +- source/convert.cc | 72 +++++++++++++ source/cpu_id.cc | 2 +- source/planar_functions.cc | 75 +++++++++++++ source/row_any.cc | 28 +++++ source/row_common.cc | 18 ++++ source/row_neon64.cc | 170 ++++++++++++++++++------------ source/scale.cc | 54 +++++----- source/scale_neon64.cc | 3 +- unit_test/convert_test.cc | 23 ++++ unit_test/planar_test.cc | 31 ++++++ 15 files changed, 428 insertions(+), 96 deletions(-) diff --git a/README.chromium b/README.chromium index 5442e7f7c..18e76cc59 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: https://chromium.googlesource.com/libyuv/libyuv/ -Version: 1899 +Version: 1900 License: BSD License File: LICENSE Shipped: yes diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index 5c7669b5d..79dcf0555 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -598,6 +598,23 @@ int I400ToI420(const uint8_t* src_y, int width, int height); +// Convert J420 to I420. 
+LIBYUV_API +int J420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); + // Convert I400 (grey) to NV21. LIBYUV_API int I400ToNV21(const uint8_t* src_y, diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 678074a14..5b79efffc 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -78,6 +78,16 @@ void Convert8To16Plane(const uint8_t* src_y, int width, int height); +LIBYUV_API +void Convert8To8Plane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 220 for Y, 225 for U,V + int bias, // 16 + int width, + int height); + // Set a plane of data to a 32 bit value. LIBYUV_API void SetPlane(uint8_t* dst_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index fee2d2481..815ac6a5a 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -507,6 +507,7 @@ extern "C" { // The following are available on AArch64 platforms: #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) +#define HAS_CONVERT8TO8ROW_NEON #define HAS_ARGBTOAR30ROW_NEON #define HAS_ABGRTOAR30ROW_NEON #define HAS_I210ALPHATOARGBROW_NEON @@ -3641,6 +3642,22 @@ void Convert16To8Row_SME(const uint16_t* src_y, int scale, int width); +void Convert8To8Row_C(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width); +void Convert8To8Row_NEON(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width); +void Convert8To8Row_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int scale, + int bias, + int width); + void CopyRow_SSE2(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX(const uint8_t* src, uint8_t* dst, int width); void CopyRow_AVX512BW(const uint8_t* src, uint8_t* dst, int width); diff --git 
a/include/libyuv/version.h b/include/libyuv/version.h index 842fe201f..adf3e8538 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1899 +#define LIBYUV_VERSION 1900 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 665f0d23d..0c974f5ff 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -4356,6 +4356,78 @@ int P010ToNV12(const uint16_t* src_y, 1, 1, 16); } +static int Planar8bitTo8bit(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height, + int subsample_x, + int subsample_y, + int scale_y, + int bias_y, + int scale_uv, + int bias_uv) { + int uv_width = SUBSAMPLE(width, subsample_x, subsample_x); + int uv_height = SUBSAMPLE(height, subsample_y, subsample_y); + if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || + height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + uv_height = -uv_height; + src_y = src_y + (height - 1) * src_stride_y; + src_u = src_u + (uv_height - 1) * src_stride_u; + src_v = src_v + (uv_height - 1) * src_stride_v; + src_stride_y = -src_stride_y; + src_stride_u = -src_stride_u; + src_stride_v = -src_stride_v; + } + + // Convert Y plane. + if (dst_y) { + Convert8To8Plane(src_y, src_stride_y, dst_y, dst_stride_y, scale_y, bias_y, + width, height); + } + // Convert UV planes. 
+ Convert8To8Plane(src_u, src_stride_u, dst_u, dst_stride_u, scale_uv, bias_uv, + uv_width, uv_height); + Convert8To8Plane(src_v, src_stride_v, dst_v, dst_stride_v, scale_uv, bias_uv, + uv_width, uv_height); + return 0; +} + +LIBYUV_API +int J420ToI420(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_u, + int src_stride_u, + const uint8_t* src_v, + int src_stride_v, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { + return Planar8bitTo8bit(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_y, dst_stride_y, dst_u, + dst_stride_u, dst_v, dst_stride_v, width, height, 1, + 1, 220, 16, 225, 16); +} + #ifdef __cplusplus } // extern "C" } // namespace libyuv diff --git a/source/cpu_id.cc b/source/cpu_id.cc index f5cc968fb..e4acbecf4 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -450,7 +450,7 @@ static SAFEBUFFERS int GetCpuFlags(void) { ((cpu_einfo7[3] & 0x00080000) ? kCpuHasAVX10 : 0) | ((cpu_info7[3] & 0x02000000) ? kCpuHasAMXINT8 : 0); if (cpu_info0[0] >= 0x24 && (cpu_einfo7[3] & 0x00080000)) { - cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2: 0; + cpu_info |= ((cpu_info24[1] & 0xFF) >= 2) ? kCpuHasAVX10_2 : 0; } } } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 4c87b7d3d..f0763c41f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -234,6 +234,81 @@ void Convert8To16Plane(const uint8_t* src_y, } } +// Convert a plane of 8 bit data to 8 bit +LIBYUV_API +void Convert8To8Plane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int scale, // 220 for Y, 225 to UV + int bias, // 16 + int width, + int height) { + int y; + void (*Convert8To8Row)(const uint8_t* src_y, uint8_t* dst_y, int scale, + int bias, int width) = Convert8To8Row_C; + + if (width <= 0 || height == 0) { + return; + } + // Negative height means invert the image. 
+ if (height < 0) { + height = -height; + dst_y = dst_y + (height - 1) * dst_stride_y; + dst_stride_y = -dst_stride_y; + } + // Coalesce rows. + if (src_stride_y == width && dst_stride_y == width) { + width *= height; + height = 1; + src_stride_y = dst_stride_y = 0; + } +#if defined(HAS_CONVERT8TO8ROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + Convert8To8Row = Convert8To8Row_Any_NEON; + if (IS_ALIGNED(width, 32)) { + Convert8To8Row = Convert8To8Row_NEON; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SME) + if (TestCpuFlag(kCpuHasSME)) { + Convert8To8Row = Convert8To8Row_SME; + } +#endif +#if defined(HAS_CONVERT8TO8ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + Convert8To8Row = Convert8To8Row_Any_SSSE3; + if (IS_ALIGNED(width, 8)) { + Convert8To8Row = Convert8To8Row_SSSE3; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_AVX2) + if (TestCpuFlag(kCpuHasAVX2)) { + Convert8To8Row = Convert8To8Row_Any_AVX2; + if (IS_ALIGNED(width, 32)) { + Convert8To8Row = Convert8To8Row_AVX2; + } + } +#endif +#if defined(HAS_CONVERT8TO8ROW_AVX512BW) + if (TestCpuFlag(kCpuHasAVX512BW)) { + Convert8To8Row = Convert8To8Row_Any_AVX512BW; + if (IS_ALIGNED(width, 64)) { + Convert8To8Row = Convert8To8Row_AVX512BW; + } + } +#endif + + // Convert plane + for (y = 0; y < height; ++y) { + Convert8To8Row(src_y, dst_y, scale, bias, width); + src_y += src_stride_y; + dst_y += dst_stride_y; + } +} + // Copy I422. LIBYUV_API int I422Copy(const uint8_t* src_y, diff --git a/source/row_any.cc b/source/row_any.cc index 70ab046ec..8344aa35f 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1780,6 +1780,34 @@ ANY11C(DivideRow_16_Any_NEON, DivideRow_16_NEON, 2, 2, uint16_t, uint16_t, 15) #endif #undef ANY11C +// Any 1 to 1 with parameter and shorts. BPP measures in shorts. 
+#define ANY11SB(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK) \ + void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int bias, \ + int width) { \ + SIMD_ALIGNED(STYPE vin[64]); \ + SIMD_ALIGNED(DTYPE vout[64]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, scale, bias, n); \ + } \ + memcpy(vin, src_ptr + n, r * SBPP); \ + ANY_SIMD(vin, vout, scale, bias, MASK + 1); \ + memcpy(dst_ptr + n, vout, r * BPP); \ + } + +#ifdef HAS_CONVERT8TO8ROW_NEON +ANY11SB(Convert8To8Row_Any_NEON, + Convert8To8Row_NEON, + 1, + 1, + uint8_t, + uint8_t, + 31) +#endif +#undef ANY11SB + // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts. #define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \ void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \ diff --git a/source/row_common.cc b/source/row_common.cc index 4b5948201..cd16c1721 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -3240,6 +3240,24 @@ void Convert8To16Row_C(const uint8_t* src_y, } } +// Use scale to convert J420 to I420 +// scale parameter is 8.8 fixed point but limited to 0 to 255 +// Function is based on DivideRow, but adds a bias +// Does not clamp +void Convert8To8Row_C(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + int x; + assert(scale >= 0); + assert(scale <= 255); + + for (x = 0; x < width; ++x) { + dst_y[x] = ((src_y[x] * scale) >> 8) + bias; + } +} + void CopyRow_C(const uint8_t* src, uint8_t* dst, int count) { memcpy(dst, src, count); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index a8ba41357..dc4ca2417 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -290,23 +290,22 @@ void I210ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile( - YUVTORGB_SETUP + asm 
volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV210 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I410ToAR30Row_NEON(const uint16_t* src_y, @@ -319,23 +318,22 @@ void I410ToAR30Row_NEON(const uint16_t* src_y, const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; uint16_t alpha = 0xc000; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "dup v23.8h, %w[alpha] \n" "1: \n" READYUV410 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [alpha] "r"(alpha) // %[alpha] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] 
"r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [alpha] "r"(alpha) // %[alpha] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I212ToAR30Row_NEON(const uint16_t* src_y, @@ -347,22 +345,21 @@ void I212ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile( - YUVTORGB_SETUP + asm volatile(YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "1: \n" READYUV212 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_u] "+r"(src_u), // %[src_u] - [src_v] "+r"(src_v), // %[src_v] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit) // %[limit] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_u] "+r"(src_u), // %[src_u] + [src_v] "+r"(src_v), // %[src_v] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit) // %[limit] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I210ToARGBRow_NEON(const uint16_t* src_y, @@ -374,7 +371,8 @@ void I210ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV210 NVTORGB RGBTORGB8 + "1: \n" READYUV210 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -397,7 +395,8 @@ void I410ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV410 NVTORGB RGBTORGB8 + "1: \n" READYUV410 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 
{v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -422,7 +421,8 @@ void I212ToARGBRow_NEON(const uint16_t* src_y, asm volatile( YUVTORGB_SETUP "movi v19.8b, #255 \n" - "1: \n" READYUV212 NVTORGB RGBTORGB8 + "1: \n" READYUV212 NVTORGB + RGBTORGB8 "subs %w[width], %w[width], #8 \n" "st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n" "b.gt 1b \n" @@ -526,22 +526,23 @@ void P210ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; const uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] \n" "1: \n" READYUVP210 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP210LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void P410ToAR30Row_NEON(const uint16_t* src_y, @@ -552,22 +553,23 @@ void P410ToAR30Row_NEON(const uint16_t* src_y, const uvec8* uv_coeff = &yuvconstants->kUVCoeff; const vec16* rgb_coeff = &yuvconstants->kRGBCoeffBias; uint16_t limit = 0x3ff0; - asm volatile(YUVTORGB_SETUP + asm volatile( + YUVTORGB_SETUP "dup v22.8h, %w[limit] \n" "movi v23.8h, #0xc0, lsl #8 \n" // A "ldr q2, [%[kIndices]] 
\n" "1: \n" READYUVP410 NVTORGB "subs %w[width], %w[width], #8 \n" STOREAR30 "b.gt 1b \n" - : [src_y] "+r"(src_y), // %[src_y] - [src_uv] "+r"(src_uv), // %[src_uv] - [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] - [width] "+r"(width) // %[width] - : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] - [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] - [limit] "r"(limit), // %[limit] - [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] - : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); + : [src_y] "+r"(src_y), // %[src_y] + [src_uv] "+r"(src_uv), // %[src_uv] + [dst_ar30] "+r"(dst_ar30), // %[dst_ar30] + [width] "+r"(width) // %[width] + : [kUVCoeff] "r"(uv_coeff), // %[kUVCoeff] + [kRGBCoeffBias] "r"(rgb_coeff), // %[kRGBCoeffBias] + [limit] "r"(limit), // %[limit] + [kIndices] "r"(kP410LoadShuffleIndices) // %[kIndices] + : "cc", "memory", YUVTORGB_REGS, "v22", "v23"); } void I422ToAR30Row_NEON(const uint8_t* src_y, @@ -820,7 +822,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, READYUV422 I4XXTORGB RGBTORGB8_TOP "subs %w[width], %w[width], #8 \n" // ARGBTOARGB1555_FROM_TOP - "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels RGB1555. + "st1 {v19.8h}, [%[dst_argb1555]], #16 \n" // store 8 pixels + // RGB1555. "b.gt 1b \n" : [src_y] "+r"(src_y), // %[src_y] [src_u] "+r"(src_u), // %[src_u] @@ -3460,7 +3463,8 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v6.16b, #66 \n" // R * 0.2578 coefficient "movi v7.16b, #16 \n" // Add 16 constant "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 pixels. + "ldp q0, q3, [%0], #32 \n" // load 16 ARGB1555 + // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. RGB555TOARGB "umull v16.8h, v0.8b, v4.8b \n" // B @@ -3492,7 +3496,8 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v26.16b, #66 \n" // R * 0.2578 coefficient "movi v27.16b, #16 \n" // Add 16 constant "1: \n" - "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 pixels. 
+ "ldp q0, q3, [%0], #32 \n" // load 16 ARGB4444 + // pixels. "subs %w2, %w2, #16 \n" // 16 processed per loop. ARGB4444TORGB "umull v16.8h, v0.8b, v24.8b \n" // B @@ -4136,7 +4141,8 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, uint32_t value) { asm volatile( "dup v0.4s, %w3 \n" // duplicate scale value. - "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b aarrggbbaarrggbb. + "zip1 v0.16b, v0.16b, v0.16b \n" // v0.16b + // aarrggbbaarrggbb. "ushr v0.8h, v0.8h, #1 \n" // scale / 2. // 8 pixel loop. @@ -5277,11 +5283,11 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "mul v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -5298,6 +5304,7 @@ void DivideRow_16_NEON(const uint16_t* src_y, "dup v4.8h, %w3 \n" "1: \n" "ldp q2, q3, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "umull v0.4s, v2.4h, v4.4h \n" "umull2 v1.4s, v2.8h, v4.8h \n" "umull v2.4s, v3.4h, v4.4h \n" @@ -5306,7 +5313,6 @@ void DivideRow_16_NEON(const uint16_t* src_y, "uzp2 v0.8h, v0.8h, v1.8h \n" "uzp2 v1.8h, v2.8h, v3.8h \n" "stp q0, q1, [%1], #32 \n" // store 16 pixels - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_y), // %1 @@ -5332,11 +5338,11 @@ void Convert16To8Row_NEON(const uint16_t* src_y, "dup v2.8h, %w3 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" + "subs %w2, %w2, #16 \n" // 16 src pixels per loop "uqshl v0.8h, v0.8h, v2.8h \n" "uqshl v1.8h, v1.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" "uzp2 v0.16b, v0.16b, v1.16b \n" - "subs %w2, %w2, #16 \n" // 16 src pixels per loop "str q0, [%1], #16 \n" // store 16 pixels "b.gt 1b \n" : "+r"(src_y), // %0 @@ -5346,6 +5352,40 @@ void Convert16To8Row_NEON(const uint16_t* src_y, : "cc", "memory", "v0", 
"v1", "v2"); } +// Use scale to convert J420 to I420 +// scale parameter is 8.8 fixed point but limited to 0 to 255 +// Function is based on DivideRow, but adds a bias +// Does not clamp +void Convert8To8Row_NEON(const uint8_t* src_y, + uint8_t* dst_y, + int scale, + int bias, + int width) { + asm volatile( + "dup v4.16b, %w3 \n" // scale + "dup v5.16b, %w4 \n" // bias + "1: \n" + "ldp q2, q3, [%0], #32 \n" + "subs %w2, %w2, #32 \n" // 32 pixels per loop + "umull v0.8h, v2.8b, v4.8b \n" + "umull2 v1.8h, v2.16b, v4.16b \n" + "umull v2.8h, v3.8b, v4.8b \n" + "umull2 v3.8h, v3.16b, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "uzp2 v0.16b, v0.16b, v1.16b \n" + "uzp2 v1.16b, v2.16b, v3.16b \n" + "add v0.16b, v0.16b, v5.16b \n" // add bias (16) + "add v1.16b, v1.16b, v5.16b \n" + "stp q0, q1, [%1], #32 \n" // store 16 pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : "r"(scale), // %3 + "r"(bias) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/source/scale.cc b/source/scale.cc index 868a84a28..76379fd6e 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -2336,9 +2336,9 @@ int I420Scale(const uint8_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2381,9 +2381,9 @@ int I420Scale_16(const uint16_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || 
dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2426,9 +2426,9 @@ int I420Scale_12(const uint16_t* src_y, int dst_halfheight = SUBSAMPLE(dst_height, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2470,9 +2470,9 @@ int I444Scale(const uint8_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2511,9 +2511,9 @@ int I444Scale_16(const uint16_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2552,9 +2552,9 @@ int I444Scale_12(const uint16_t* src_y, enum FilterMode filtering) { int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if 
(!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2598,9 +2598,9 @@ int I422Scale(const uint8_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2641,9 +2641,9 @@ int I422Scale_16(const uint16_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } @@ -2684,9 +2684,9 @@ int I422Scale_12(const uint16_t* src_y, int dst_halfwidth = SUBSAMPLE(dst_width, 1, 1); int r; - if (!src_y || !src_u || !src_v || src_width <= 0 || - src_height == 0 || src_width > 32768 || src_height > 32768 || !dst_y || - !dst_u || !dst_v || dst_width <= 0 || dst_height <= 0) { + if (!src_y || !src_u || !src_v || src_width <= 0 || src_height == 0 || + src_width > 32768 || src_height > 32768 || !dst_y || !dst_u || !dst_v || + dst_width <= 0 || dst_height <= 0) { return -1; } diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc index de19989fc..848d55416 100644 --- a/source/scale_neon64.cc +++ b/source/scale_neon64.cc @@ -1369,7 +1369,8 @@ void ScaleRowDown2_16_NEON(const uint16_t* src_ptr, "uzp2 v1.8h, v2.8h, v3.8h \n" "uzp2 v2.8h, v4.8h, v5.8h \n" "uzp2 
v3.8h, v6.8h, v7.8h \n" - "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per iteration. + "subs %w[dst_width], %w[dst_width], #32 \n" // 32 elems per + // iteration. "stp q0, q1, [%[dst_ptr]] \n" "stp q2, q3, [%[dst_ptr], #32] \n" "add %[dst_ptr], %[dst_ptr], #64 \n" diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index ef30b12b5..e9e58d329 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -188,6 +188,7 @@ TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H010, uint16_t, 2, 2, 2, 10) TESTPLANARTOP(H010, uint16_t, 2, 2, 2, H420, uint8_t, 1, 2, 2, 10) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2, 8) TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H012, uint16_t, 2, 2, 2, 8) +TESTPLANARTOP(J420, uint8_t, 1, 2, 2, I420, uint8_t, 1, 2, 2, 8) TESTPLANARTOP(I010, uint16_t, 2, 2, 2, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I210, uint16_t, 2, 2, 1, I410, uint16_t, 2, 1, 1, 10) TESTPLANARTOP(I012, uint16_t, 2, 2, 2, I412, uint16_t, 2, 1, 1, 12) @@ -2107,6 +2108,28 @@ TEST_F(LibYUVConvertTest, TestRGB24ToI420) { } #endif +TEST_F(LibYUVConvertTest, TestJ420ToI420) { + const uint8_t src_y[12] = {0, 0, 128, 128, 255, 255, + 0, 0, 128, 128, 255, 255}; + const uint8_t src_u[3] = {0, 128, 255}; + const uint8_t src_v[3] = {0, 128, 255}; + uint8_t dst_y[12]; + uint8_t dst_u[3]; + uint8_t dst_v[3]; + ASSERT_EQ(J420ToI420(src_y, 6, src_u, 3, src_v, 3, dst_y, 6, dst_u, 3, dst_v, + 3, 6, 2), + 0); + EXPECT_EQ(dst_y[0], 16); + EXPECT_EQ(dst_y[2], 126); + EXPECT_EQ(dst_y[4], 235); + EXPECT_EQ(dst_u[0], 16); + EXPECT_EQ(dst_u[1], 128); + EXPECT_EQ(dst_u[2], 240); + EXPECT_EQ(dst_v[0], 16); + EXPECT_EQ(dst_v[1], 128); + EXPECT_EQ(dst_v[2], 240); +} + #endif // !defined(LEAN_TESTS) } // namespace libyuv diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index ca3cbe769..576696bca 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3802,6 +3802,37 @@ TEST_F(LibYUVPlanarTest, Convert16To8Plane) { 
free_aligned_buffer_page_end(dst_pixels_y_c); } +TEST_F(LibYUVPlanarTest, Convert8To8Plane) { + const int kPixels = benchmark_width_ * benchmark_height_; + align_buffer_page_end(src_pixels_y, kPixels); + align_buffer_page_end(dst_pixels_y_opt, kPixels); + align_buffer_page_end(dst_pixels_y_c, kPixels); + + MemRandomize(src_pixels_y, kPixels); + memset(dst_pixels_y_opt, 0, kPixels); + memset(dst_pixels_y_c, 1, kPixels); + + MaskCpuFlags(disable_cpu_flags_); + Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_c, + benchmark_width_, 220, 16, benchmark_width_, + benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + Convert8To8Plane(src_pixels_y, benchmark_width_, dst_pixels_y_opt, + benchmark_width_, 220, 16, benchmark_width_, + benchmark_height_); + } + + for (int i = 0; i < kPixels; ++i) { + EXPECT_EQ(dst_pixels_y_opt[i], dst_pixels_y_c[i]); + } + + free_aligned_buffer_page_end(src_pixels_y); + free_aligned_buffer_page_end(dst_pixels_y_opt); + free_aligned_buffer_page_end(dst_pixels_y_c); +} + TEST_F(LibYUVPlanarTest, YUY2ToY) { const int kPixels = benchmark_width_ * benchmark_height_; align_buffer_page_end(src_pixels_y, kPixels * 2);