Upstream all libyuv changes to version 1746

Prefetch for all ARM functions - helps performance at higher resolutions.
Make MirrorPlane function public.

Bug: libyuv:855
Change-Id: I4020face6b52767ee78d81870314285d63e98b95
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2113650
Reviewed-by: Hsiu Wang <hsiu@google.com>
Authored by Frank Barchard on 2020-03-20 15:22:53 -07:00; committed by Frank Barchard
parent 45f1f2b201
commit b5e223ac4c
14 changed files with 1816 additions and 1493 deletions
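
The prefetch pattern repeated throughout the AArch64 kernels below is "prfm pldl1keep, [reg, 448]": 448 bytes is 7 cache lines of 64 bytes, which is why the diffs all carry a "prefetch 7 lines ahead" comment. A minimal sketch of the loop shape, assuming a plain copy loop rather than any specific libyuv kernel:

#include <stdint.h>

// Sketch only: load, work, store, then prefetch 448 bytes (7 x 64-byte
// cache lines) past the source pointer. Assumes width is a multiple of 16.
static void CopyWithPrefetch(const uint8_t* src, uint8_t* dst, int width) {
  asm volatile(
      "1:                                        \n"
      "ld1     {v0.16b}, [%0], #16               \n"  // load 16 bytes
      "subs    %w2, %w2, #16                     \n"  // 16 per loop
      "st1     {v0.16b}, [%1], #16               \n"  // store 16 bytes
      "prfm    pldl1keep, [%0, 448]              \n"  // prefetch 7 lines ahead
      "b.gt    1b                                \n"
      : "+r"(src), "+r"(dst), "+r"(width)
      :
      : "cc", "memory", "v0");
}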

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1744
Version: 1746
License: BSD
License File: LICENSE

View File

@ -313,6 +313,15 @@ int ARGBMirror(const uint8_t* src_argb,
int width,
int height);
// Mirror a plane of data.
LIBYUV_API
void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int width,
int height);
// Convert NV12 to RGB565.
LIBYUV_API
int NV12ToRGB565(const uint8_t* src_y,
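
With MirrorPlane exported, a single plane can be mirrored without going through I400Mirror. A minimal usage sketch, assuming this header is include/libyuv/planar_functions.h:

#include "libyuv/planar_functions.h"  // assumed location of the declaration

// Mirror a luma plane left-to-right. A negative height additionally flips
// the image vertically (see the implementation later in this change).
void MirrorLuma(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
                int dst_stride_y, int width, int height) {
  libyuv::MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}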

View File

@ -118,6 +118,10 @@ void RotatePlane270(const uint8_t* src,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV90(const uint8_t* src,
int src_stride,
@ -128,10 +132,6 @@ void RotateUV90(const uint8_t* src,
int width,
int height);
// Rotations for when U and V are interleaved.
// These functions take one input pointer and
// split the data into two buffers while
// rotating them. Deprecated.
LIBYUV_API
void RotateUV180(const uint8_t* src,
int src_stride,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1744
#define LIBYUV_VERSION 1746
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -33,8 +33,10 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
"ld1 {v2.16b, v3.16b}, [%1], #32 \n"
"eor v0.16b, v0.16b, v2.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"eor v1.16b, v1.16b, v3.16b \n"
"cnt v0.16b, v0.16b \n"
"prfm pldl1keep, [%1, 448] \n"
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
@ -65,8 +67,10 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
"subs %w2, %w2, #16 \n"
"usubl v2.8h, v0.8b, v1.8b \n"
"usubl2 v3.8h, v0.16b, v1.16b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"smlal v16.4s, v2.4h, v2.4h \n"
"smlal v17.4s, v3.4h, v3.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"smlal2 v18.4s, v2.8h, v2.8h \n"
"smlal2 v19.4s, v3.8h, v3.8h \n"
"b.gt 1b \n"

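For reference, HammingDistance_NEON vectorizes an XOR followed by a population count, accumulated over the buffer. A scalar sketch using the GCC/Clang builtin rather than the library's own C path:

#include <stdint.h>

// Count the bits that differ between two buffers.
static uint32_t HammingDistanceScalar(const uint8_t* src_a,
                                      const uint8_t* src_b, int count) {
  uint32_t diff = 0u;
  for (int i = 0; i < count; ++i) {
    diff += (uint32_t)__builtin_popcount(src_a[i] ^ src_b[i]);
  }
  return diff;
}
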
View File

@ -716,70 +716,6 @@ void MergeRGBPlane(const uint8_t* src_r,
}
}
// Mirror a plane of data.
void MirrorPlane(const uint8_t* src_y,
int src_stride_y,
uint8_t* dst_y,
int dst_stride_y,
int width,
int height) {
int y;
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
#endif
#if defined(HAS_MIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MirrorRow = MirrorRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
MirrorRow = MirrorRow_MMI;
}
}
#endif
// Mirror plane
for (y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
// Convert YUY2 to I422.
LIBYUV_API
int YUY2ToI422(const uint8_t* src_yuy2,
@ -1047,6 +983,68 @@ int YUY2ToY(const uint8_t* src_yuy2,
return 0;
}
// Mirror a plane of data.
// See Also I400Mirror
LIBYUV_API
void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
int dst_stride_y, int width, int height) {
int y;
void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
src_y = src_y + (height - 1) * src_stride_y;
src_stride_y = -src_stride_y;
}
#if defined(HAS_MIRRORROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MirrorRow = MirrorRow_Any_NEON;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_NEON;
}
}
#endif
#if defined(HAS_MIRRORROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
MirrorRow = MirrorRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
}
#endif
#if defined(HAS_MIRRORROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MirrorRow = MirrorRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MirrorRow = MirrorRow_AVX2;
}
}
#endif
#if defined(HAS_MIRRORROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
MirrorRow = MirrorRow_Any_MSA;
if (IS_ALIGNED(width, 64)) {
MirrorRow = MirrorRow_MSA;
}
}
#endif
#if defined(HAS_MIRRORROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
MirrorRow = MirrorRow_Any_MMI;
if (IS_ALIGNED(width, 8)) {
MirrorRow = MirrorRow_MMI;
}
}
#endif
// Mirror plane
for (y = 0; y < height; ++y) {
MirrorRow(src_y, dst_y, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
}
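
The MirrorRow_C fallback that the dispatch above starts from writes each row byte-reversed; its semantics, paraphrased as a sketch rather than the file's exact body:

// Sketch of the scalar fallback: the dst row is the src row reversed.
static void MirrorRowSketch(const uint8_t* src, uint8_t* dst, int width) {
  for (int x = 0; x < width; ++x) {
    dst[x] = src[width - 1 - x];
  }
}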
// Mirror I400 with optional flipping
LIBYUV_API
int I400Mirror(const uint8_t* src_y,

View File

@ -21,17 +21,21 @@ namespace libyuv {
extern "C" {
#endif
static void ARGBTranspose(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBTranspose(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
int i;
int src_pixel_step = src_stride_argb >> 2;
void (*ScaleARGBRowDownEven)(
const uint8_t* src_argb, ptrdiff_t src_stride_argb, int src_step,
uint8_t* dst_argb, int dst_width) = ScaleARGBRowDownEven_C;
// Check stride is a multiple of 4.
if (src_stride_argb & 3) {
return -1;
}
#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleARGBRowDownEven = ScaleARGBRowDownEven_Any_SSE2;
@ -70,44 +74,45 @@ static void ARGBTranspose(const uint8_t* src_argb,
dst_argb += dst_stride_argb;
src_argb += 4;
}
return 0;
}
void ARGBRotate90(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate90(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Rotate by 90 is an ARGBTranspose with the source read
// from bottom to top. So set the source pointer to the end
// of the buffer and flip the sign of the source stride.
src_argb += src_stride_argb * (height - 1);
src_stride_argb = -src_stride_argb;
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
}
void ARGBRotate270(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate270(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Rotate by 270 is an ARGBTranspose with the destination written
// from bottom to top. So set the destination pointer to the end
// of the buffer and flip the sign of the destination stride.
dst_argb += dst_stride_argb * (width - 1);
dst_stride_argb = -dst_stride_argb;
ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return ARGBTranspose(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
}
void ARGBRotate180(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
static int ARGBRotate180(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_argb,
int dst_stride_argb,
int width,
int height) {
// Swap first and last row and mirror the content. Uses a temporary row.
align_buffer_64(row, width * 4);
const uint8_t* src_bot = src_argb + src_stride_argb * (height - 1);
@ -190,6 +195,7 @@ void ARGBRotate180(const uint8_t* src_argb,
dst_bot -= dst_stride_argb;
}
free_aligned_buffer_64(row);
return 0;
}
LIBYUV_API
@ -217,17 +223,14 @@ int ARGBRotate(const uint8_t* src_argb,
return ARGBCopy(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate90:
ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate90(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate270:
ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate270(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
case kRotate180:
ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb, width,
height);
return 0;
return ARGBRotate180(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
width, height);
default:
break;
}
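
Because ARGBRotate now forwards ARGBTranspose's result, a 90- or 270-degree rotation fails with -1 when the source stride is not a multiple of 4 (the check added above, and exercised by the new RotatePlane90_TestStride test at the end of this change). A usage sketch, assuming ARGBRotate is declared in include/libyuv/rotate_argb.h:

#include "libyuv/rotate_argb.h"  // assumed location of ARGBRotate

// Returns true on success. ARGBRotate now returns -1 for kRotate90 and
// kRotate270 when src_stride is not a multiple of 4.
bool Rotate90Checked(const uint8_t* src, int src_stride, uint8_t* dst,
                     int dst_stride, int width, int height) {
  return libyuv::ARGBRotate(src, src_stride, dst, dst_stride, width, height,
                            libyuv::kRotate90) == 0;
}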

View File

@ -37,7 +37,7 @@ void TransposeWx8_NEON(const uint8_t* src,
"sub %w3, %w3, #8 \n"
// handle 8x8 blocks. this should be the majority of the plane
"1: \n"
"1: \n"
"mov %0, %1 \n"
"ld1 {v0.8b}, [%0], %5 \n"
@ -48,23 +48,39 @@ void TransposeWx8_NEON(const uint8_t* src,
"ld1 {v5.8b}, [%0], %5 \n"
"ld1 {v6.8b}, [%0], %5 \n"
"ld1 {v7.8b}, [%0] \n"
"mov %0, %1 \n"
"trn2 v16.8b, v0.8b, v1.8b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"trn1 v17.8b, v0.8b, v1.8b \n"
"add %0, %0, %5 \n"
"trn2 v18.8b, v2.8b, v3.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 1
"trn1 v19.8b, v2.8b, v3.8b \n"
"add %0, %0, %5 \n"
"trn2 v20.8b, v4.8b, v5.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 2
"trn1 v21.8b, v4.8b, v5.8b \n"
"add %0, %0, %5 \n"
"trn2 v22.8b, v6.8b, v7.8b \n"
"prfm pldl1keep, [%0, 448] \n" // row 3
"trn1 v23.8b, v6.8b, v7.8b \n"
"add %0, %0, %5 \n"
"trn2 v3.4h, v17.4h, v19.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 4
"trn1 v1.4h, v17.4h, v19.4h \n"
"add %0, %0, %5 \n"
"trn2 v2.4h, v16.4h, v18.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 5
"trn1 v0.4h, v16.4h, v18.4h \n"
"add %0, %0, %5 \n"
"trn2 v7.4h, v21.4h, v23.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 6
"trn1 v5.4h, v21.4h, v23.4h \n"
"add %0, %0, %5 \n"
"trn2 v6.4h, v20.4h, v22.4h \n"
"prfm pldl1keep, [%0, 448] \n" // row 7
"trn1 v4.4h, v20.4h, v22.4h \n"
"trn2 v21.2s, v1.2s, v5.2s \n"
@ -226,6 +242,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
"ld1 {v5.16b}, [%0], %5 \n"
"ld1 {v6.16b}, [%0], %5 \n"
"ld1 {v7.16b}, [%0] \n"
"mov %0, %1 \n"
"trn1 v16.16b, v0.16b, v1.16b \n"
"trn2 v17.16b, v0.16b, v1.16b \n"

View File

@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u};
0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
@ -1101,8 +1101,11 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"lea 0x40(%0),%0 \n" \
"phaddw %%xmm0,%%xmm6 \n" \
"phaddw %%xmm2,%%xmm1 \n" \
"paddw %%" #round ",%%xmm6 \n" \
"paddw %%" #round ",%%xmm1 \n" \
"prefetcht0 1280(%0) \n" \
"paddw %%" #round \
",%%xmm6 \n" \
"paddw %%" #round \
",%%xmm1 \n" \
"psrlw $0x8,%%xmm6 \n" \
"psrlw $0x8,%%xmm1 \n" \
"packuswb %%xmm1,%%xmm6 \n" \
@ -1111,33 +1114,36 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) {
"sub $0x10,%2 \n" \
"jg 1b \n"
#define RGBTOY_AVX2(round) \
"1: \n" \
"vmovdqu (%0),%%ymm0 \n" \
"vmovdqu 0x20(%0),%%ymm1 \n" \
"vmovdqu 0x40(%0),%%ymm2 \n" \
"vmovdqu 0x60(%0),%%ymm3 \n" \
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
"vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
"vmovdqu %%ymm0,(%1) \n" \
"lea 0x20(%1),%1 \n" \
"sub $0x20,%2 \n" \
"jg 1b \n" \
#define RGBTOY_AVX2(round) \
"1: \n" \
"vmovdqu (%0),%%ymm0 \n" \
"vmovdqu 0x20(%0),%%ymm1 \n" \
"vmovdqu 0x40(%0),%%ymm2 \n" \
"vmovdqu 0x60(%0),%%ymm3 \n" \
"vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \
"vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \
"vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \
"vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \
"vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \
"vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \
"vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \
"vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \
"lea 0x80(%0),%0 \n" \
"vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \
"prefetcht0 1280(%0) \n" \
"vpaddw %%" #round \
",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \
"vpaddw %%" #round \
",%%ymm2,%%ymm2 \n" \
"vpsrlw $0x8,%%ymm0,%%ymm0 \n" \
"vpsrlw $0x8,%%ymm2,%%ymm2 \n" \
"vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \
"vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \
"vmovdqu %%ymm0,(%1) \n" \
"lea 0x20(%1),%1 \n" \
"sub $0x20,%2 \n" \
"jg 1b \n" \
"vzeroupper \n"
#ifdef HAS_ARGBTOYROW_SSSE3
@ -1148,15 +1154,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYROW_SSSE3
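
The x86 counterpart of the ARM prefetch is the "prefetcht0 1280(%0)" added to RGBTOY and RGBTOY_AVX2 above: a T0-hint prefetch 1280 bytes (20 x 64-byte cache lines) past the source pointer. Roughly the same thing written with intrinsics (a sketch, not code from this change):

#include <xmmintrin.h>

// T0 hint: fetch into all cache levels, 1280 bytes ahead of the read pointer.
static inline void PrefetchSrc(const uint8_t* p) {
  _mm_prefetch((const char*)(p + 1280), _MM_HINT_T0);
}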
@ -1168,8 +1174,7 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
RGBTOY(xmm5)
LABELALIGN RGBTOY(xmm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1187,8 +1192,7 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
LABELALIGN
RGBTOY(xmm5)
LABELALIGN RGBTOY(xmm5)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1210,8 +1214,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm7)
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1219,7 +1222,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYROW_AVX2
@ -1232,8 +1236,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"vbroadcastf128 %5,%%ymm7 \n"
"vmovdqu %6,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm7)
LABELALIGN RGBTOY_AVX2(ymm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1241,7 +1244,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"m"(kSub128), // %4
"m"(kAddY16), // %5
"m"(kPermdARGBToY_AVX) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ABGRTOYROW_AVX2
@ -1253,15 +1257,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm5)
LABELALIGN RGBTOY_AVX2(ymm5)
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kARGBToYJ), // %3
"m"(kSub128), // %4
"m"(kPermdARGBToY_AVX) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
#endif // HAS_ARGBTOYJROW_AVX2
@ -1273,9 +1277,8 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"vbroadcastf128 %4,%%ymm5 \n"
"vmovdqu %5,%%ymm6 \n"
LABELALIGN
RGBTOY_AVX2(ymm5)
"vzeroupper \n"
LABELALIGN RGBTOY_AVX2(
ymm5) "vzeroupper \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
@ -1536,7 +1539,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0,
"+r"(dst_v), // %2
"+rm"(width) // %3
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kSub128), // %5
"m"(kSub128), // %5
"m"(kARGBToVJ), // %6
"m"(kARGBToUJ), // %7
"m"(kShufARGBToUV_AVX) // %8
@ -1606,7 +1609,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0,
: "r"((intptr_t)(src_stride_argb)), // %4
"m"(kARGBToVJ), // %5
"m"(kARGBToUJ), // %6
"m"(kSub128) // %7
"m"(kSub128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7");
}
#endif // HAS_ARGBTOUVJROW_SSSE3
@ -1675,15 +1678,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kBGRAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0,
@ -1755,15 +1758,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kABGRToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
@ -1772,15 +1775,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"movdqa %4,%%xmm5 \n"
"movdqa %5,%%xmm7 \n"
LABELALIGN
RGBTOY(xmm7)
LABELALIGN RGBTOY(xmm7)
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "m"(kRGBAToY), // %3
"m"(kSub128), // %4
"m"(kAddY16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
}
void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0,

File diff suppressed because it is too large.

View File

@ -278,7 +278,8 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// RGB565.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@ -315,7 +316,8 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels
// ARGB1555.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
@ -401,6 +403,7 @@ void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
"orr v22.8b, v20.8b, v20.8b \n"
"subs %w2, %w2, #8 \n"
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
@ -527,7 +530,8 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
v20) "subs %w3, %w3, #8 \n" ARGBTORGB565
"st1 {v0.8h}, [%2], 16 \n" // store 8 pixels
// RGB565.
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
@ -601,6 +605,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st1 {v0.16b}, [%1], #16 \n" // store U
"st1 {v1.16b}, [%2], #16 \n" // store V
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@ -622,6 +627,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
"ld1 {v1.16b}, [%1], #16 \n" // load V
"subs %w3, %w3, #16 \n" // 16 processed per loop
"st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
@ -645,6 +651,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
"st1 {v0.16b}, [%1], #16 \n" // store R
"st1 {v1.16b}, [%2], #16 \n" // store G
"st1 {v2.16b}, [%3], #16 \n" // store B
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_r), // %1
@ -669,6 +676,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
"ld1 {v2.16b}, [%2], #16 \n" // load B
"subs %w4, %w4, #16 \n" // 16 processed per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_r), // %0
"+r"(src_g), // %1
@ -687,6 +695,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"ldp q0, q1, [%0], #32 \n"
"subs %w2, %w2, #32 \n" // 32 processed per loop
"stp q0, q1, [%1], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -703,6 +712,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
"1: \n"
"subs %w1, %w1, #16 \n" // 16 bytes per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -716,6 +726,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
"1: \n"
"subs %w1, %w1, #4 \n" // 4 ints per loop
"st1 {v0.16b}, [%0], #16 \n" // store
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst), // %0
"+r"(width) // %1
@ -739,6 +750,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"tbl v1.16b, {v1.16b}, v3.16b \n"
"tbl v0.16b, {v2.16b}, v3.16b \n"
"st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -763,6 +775,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv,
"rev64 v1.8b, v1.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // dst += 8
"st1 {v1.8b}, [%2], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
@ -783,6 +796,7 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
"rev64 v0.4s, v0.4s \n"
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
"st1 {v0.D}[0], [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -800,6 +814,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
"ld3 {v1.8b,v2.8b,v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
@ -818,6 +833,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
@ -836,6 +852,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
"orr v2.8b, v4.8b, v4.8b \n" // move g
"orr v1.8b, v5.8b, v5.8b \n" // move r
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgba), // %1
@ -853,6 +870,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
"orr v3.8b, v1.8b, v1.8b \n" // move g
"orr v4.8b, v0.8b, v0.8b \n" // move r
"st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_rgb24), // %1
@ -885,6 +903,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
RGB565TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
@ -942,6 +961,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
ARGB1555TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
@ -972,7 +992,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
@ -989,8 +1009,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
"1: \n"
"ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w2, %w2, #8 \n" // 8 processed per loop.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of
// RGB24.
"st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
@ -1023,6 +1043,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
@ -1038,6 +1059,7 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
"ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
@ -1057,6 +1079,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v1.8b}, [%1], #8 \n" // store 8 U.
"st1 {v3.8b}, [%2], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
@ -1077,6 +1100,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
"subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
"st1 {v0.8b}, [%1], #8 \n" // store 8 U.
"st1 {v2.8b}, [%2], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
@ -1102,6 +1126,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
"urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
"st1 {v1.8b}, [%2], #8 \n" // store 8 U.
"st1 {v3.8b}, [%3], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(src_yuy2b), // %1
@ -1129,6 +1154,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
"urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
"st1 {v0.8b}, [%2], #8 \n" // store 8 U.
"st1 {v2.8b}, [%3], #8 \n" // store 8 V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(src_uyvyb), // %1
@ -1153,6 +1179,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #4 \n" // 4 processed per loop
"tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
"st1 {v1.16b}, [%1], #16 \n" // store 4.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -1175,6 +1202,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
"ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@ -1198,6 +1226,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
"ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
"subs %w4, %w4, #16 \n" // 16 pixels
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
@ -1217,6 +1246,7 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
"subs %w2, %w2, #8 \n" // 8 processed per loop.
ARGBTORGB565
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
@ -1238,6 +1268,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
"uqadd v21.8b, v21.8b, v1.8b \n"
"uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565
"st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_rgb) // %0
: "r"(src_argb), // %1
@ -1256,6 +1287,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
ARGBTOARGB1555
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB1555.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
@ -1276,6 +1308,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
ARGBTOARGB4444
"st1 {v0.16b}, [%1], #16 \n" // store 8 pixels
// ARGB4444.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
@ -1299,6 +1332,7 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1316,6 +1350,7 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
// pixels
"subs %w2, %w2, #16 \n" // 16 processed per loop
"st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_a), // %1
@ -1338,6 +1373,7 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v3.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1359,6 +1395,7 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
"umlal v0.8h, v3.8b, v6.8b \n" // R
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
@ -1399,6 +1436,7 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
@ -1767,6 +1805,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
"uqshrn v1.8b, v17.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(src_rgb565_1), // %1
@ -1832,6 +1871,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(src_argb1555_1), // %1
@ -1897,6 +1937,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(src_argb4444_1), // %1
@ -1927,6 +1968,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
@ -1954,6 +1996,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
@ -1980,6 +2023,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v27.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
@ -2003,6 +2047,7 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
@ -2026,6 +2071,7 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
@ -2049,6 +2095,7 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
@ -2072,6 +2119,7 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
@ -2095,6 +2143,7 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
"uqadd v0.8b, v0.8b, v7.8b \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
@ -2116,6 +2165,7 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
"umlal v0.8h, v2.8b, v6.8b \n" // R
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_yj), // %1
@ -2135,8 +2185,10 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
"umull v0.8h, v0.8b, v4.8b \n" // B
"umlal v0.8h, v1.8b, v5.8b \n" // G
"umlal v0.8h, v2.8b, v6.8b \n" // R
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_yj), // %1
@ -2174,6 +2226,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v2.8h, #8 \n"
"rshrn2 v0.16b, v3.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"b 99f \n"
@ -2290,6 +2343,7 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
"uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB
// pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2331,6 +2385,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
"uqxtn v1.8b, v1.8h \n"
"uqxtn v2.8b, v2.8h \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@ -2369,6 +2424,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
"uqxtn v6.8b, v6.8h \n"
"uqxtn v7.8b, v7.8h \n"
"st4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2395,6 +2451,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
"orr v1.8b, v0.8b, v0.8b \n" // G
"orr v2.8b, v0.8b, v0.8b \n" // R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2435,6 +2492,7 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
"uqshrn v1.8b, v5.8h, #7 \n" // 16 bit to 8 bit G
"uqshrn v2.8b, v6.8h, #7 \n" // 16 bit to 8 bit R
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // store 8 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(width) // %1
@ -2495,6 +2553,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
"sqshrun v18.8b, v24.8h, #6 \n" // 16 bit to 8 bit R
"sqshrun v19.8b, v25.8h, #6 \n" // 16 bit to 8 bit A
"st4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%1], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -2525,6 +2584,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0,
"rshrn v2.8b, v2.8h, #8 \n" // 16 bit to 8 bit R
"rshrn v3.8b, v3.8h, #8 \n" // 16 bit to 8 bit A
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2550,6 +2610,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0,
"uqadd v2.8b, v2.8b, v6.8b \n"
"uqadd v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2575,6 +2636,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0,
"uqsub v2.8b, v2.8b, v6.8b \n"
"uqsub v3.8b, v3.8b, v7.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
@ -2604,6 +2666,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
"orr v1.8b, v0.8b, v0.8b \n"
"orr v2.8b, v0.8b, v0.8b \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2626,6 +2689,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"uqadd v0.16b, v0.16b, v1.16b \n" // add
"st1 {v0.16b}, [%2], #16 \n" // store 16 pixels.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2653,6 +2717,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"uqadd v1.8b, v0.8b, v2.8b \n" // add
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_sobelx), // %0
"+r"(src_sobely), // %1
@ -2689,6 +2754,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%3], #8 \n" // store 8 sobelx
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@ -2727,6 +2793,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
"abs v0.8h, v0.8h \n"
"uqxtn v0.8b, v0.8h \n"
"st1 {v0.8b}, [%2], #8 \n" // store 8 sobely
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y0), // %0
"+r"(src_y1), // %1
@ -2754,6 +2821,7 @@ void HalfFloat1Row_NEON(const uint16_t* src,
"fcvtn v1.4h, v2.4s \n" // 8 half floats
"fcvtn2 v1.8h, v3.4s \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2779,6 +2847,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
"uqshrn v1.4h, v2.4s, #13 \n" // isolate halffloat
"uqshrn2 v1.8h, v3.4s, #13 \n"
"st1 {v1.16b}, [%1], #16 \n" // store 8 shorts
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2803,6 +2872,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"fmul v3.4s, v3.4s, %3.s[0] \n"
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2828,6 +2898,7 @@ float ScaleMaxSamples_NEON(const float* src,
"fmax v5.4s, v5.4s, v1.4s \n" // max
"fmax v6.4s, v6.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"fmax v5.4s, v5.4s, v6.4s \n" // max
"fmaxv %s3, v5.4s \n" // signed max accumulator
@ -2857,6 +2928,7 @@ float ScaleSumSamples_NEON(const float* src,
"fmla v5.4s, v1.4s, v1.4s \n" // sum of squares
"fmla v6.4s, v2.4s, v2.4s \n"
"st1 {v3.4s, v4.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
"faddp v5.4s, v5.4s, v6.4s \n"
"faddp v5.4s, v5.4s, v5.4s \n"
@ -2878,6 +2950,7 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
"fmul v1.4s, v1.4s, %3.s[0] \n" // scale
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
"st1 {v1.4s, v2.4s}, [%1], #32 \n" // store 8 samples
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
@ -2902,18 +2975,23 @@ void GaussCol_NEON(const uint16_t* src0,
"ld1 {v1.8h}, [%0], #16 \n" // load 8 samples, 5 rows
"ld1 {v2.8h}, [%4], #16 \n"
"uaddl v0.4s, v1.4h, v2.4h \n" // * 1
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"uaddl2 v1.4s, v1.8h, v2.8h \n" // * 1
"ld1 {v2.8h}, [%1], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"prfm pldl1keep, [%1, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"ld1 {v2.8h}, [%2], #16 \n"
"umlal v0.4s, v2.4h, v7.4h \n" // * 6
"prfm pldl1keep, [%2, 448] \n"
"umlal2 v1.4s, v2.8h, v7.8h \n" // * 6
"ld1 {v2.8h}, [%3], #16 \n"
"umlal v0.4s, v2.4h, v6.4h \n" // * 4
"prfm pldl1keep, [%3, 448] \n"
"umlal2 v1.4s, v2.8h, v6.8h \n" // * 4
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s,v1.4s}, [%5], #32 \n" // store 8 samples
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src0), // %0
"+r"(src1), // %1
@ -2946,6 +3024,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
"ld1 {v4.4s,v5.4s}, [%3], #32 \n"
"add v2.4s, v2.4s, v4.4s \n" // add rows for * 4
"add v3.4s, v3.4s, v5.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"mla v0.4s, v2.4s, v6.4s \n" // * 4
"mla v1.4s, v3.4s, v6.4s \n" // * 4
"subs %w5, %w5, #8 \n" // 8 processed per loop
@ -2982,14 +3061,19 @@ void GaussCol_F32_NEON(const float* src0,
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%2], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmla v0.4s, v4.4s, v7.4s \n" // * 6
"ld1 {v2.4s, v3.4s}, [%3], #32 \n"
"fmla v1.4s, v5.4s, v7.4s \n"
"prfm pldl1keep, [%1, 448] \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"ld1 {v4.4s, v5.4s}, [%4], #32 \n"
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%2, 448] \n"
"fadd v0.4s, v0.4s, v4.4s \n" // * 1
"prfm pldl1keep, [%3, 448] \n"
"fadd v1.4s, v1.4s, v5.4s \n"
"prfm pldl1keep, [%4, 448] \n"
"subs %w6, %w6, #8 \n" // 8 processed per loop
"st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples
"b.gt 1b \n"
@ -3024,6 +3108,7 @@ void GaussRow_F32_NEON(const float* src,
"fadd v3.4s, v3.4s, v5.4s \n"
"fmla v0.4s, v2.4s, v6.4s \n" // * 4
"fmla v1.4s, v3.4s, v6.4s \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"fmul v0.4s, v0.4s, v8.4s \n" // / 256
"fmul v1.4s, v1.4s, v8.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
@ -3052,6 +3137,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
"zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values
"subs %w3, %w3, #16 \n" // 16 pixels per loop
"st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_vu), // %1
@ -3079,6 +3165,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
"uqrshrn v2.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v2.8b,v3.8b}, [%2], #16 \n" // store 8 pixels UV.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@ -3107,6 +3194,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
"uqrshrn v1.8b, v1.8h, #2 \n"
"subs %w3, %w3, #16 \n" // 16 processed per loop.
"st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU.
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(src_ayuv_1), // %1
@ -3124,6 +3212,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// pixels
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ayuv), // %0
"+r"(dst_y), // %1
@ -3140,6 +3229,7 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
"subs %w2, %w2, #16 \n" // 16 pixels per loop
"st2 {v1.16b, v2.16b}, [%1], #32 \n" // store 16 VU pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_vu), // %1

View File

@ -31,6 +31,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
"subs %w2, %w2, #16 \n" // 16 processed per loop
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"st1 {v1.16b}, [%1], #16 \n" // store odd pixels
"b.gt 1b \n"
: "+r"(src_ptr), // %0
@ -54,6 +55,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #16 \n" // 16 processed per loop
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -82,6 +84,8 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v0.8b, v0.8h, #2 \n" // round and pack
"rshrn2 v0.16b, v1.8h, #2 \n"
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -102,6 +106,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
"subs %w2, %w2, #8 \n" // 8 processed per loop
"st1 {v2.8b}, [%1], #8 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -131,6 +136,10 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
"addp v0.8h, v0.8h, v0.8h \n"
"rshrn v0.8b, v0.8h, #4 \n" // divide by 16 w/rounding
"st1 {v0.s}[0], [%1], #4 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"prfm pldl1keep, [%4, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
@ -156,7 +165,8 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #24 \n"
"orr v2.16b, v3.16b, v3.16b \n" // order v0,v1,v2
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -211,7 +221,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -252,7 +264,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
"uqrshrn v2.8b, v4.8h, #2 \n"
"st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -286,7 +300,8 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
"tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
"st1 {v2.8b}, [%1], #8 \n"
"st1 {v2.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -400,7 +415,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"prfm pldl1keep, [%3, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@ -504,7 +522,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
"st1 {v3.8b}, [%1], #8 \n"
"st1 {v3.s}[2], [%1], #4 \n"
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(tmp_src_stride), // %2
@ -528,7 +548,8 @@ void ScaleAddRow_NEON(const uint8_t* src_ptr,
"uaddw v1.8h, v1.8h, v0.8b \n"
"st1 {v1.8h, v2.8h}, [%1], #32 \n" // store accumulator
"subs %w2, %w2, #16 \n" // 16 processed per loop
"b.gt 1b \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(src_width) // %2
@ -599,7 +620,7 @@ void ScaleFilterCols_NEON(uint8_t* dst_ptr,
"add v1.4s, v1.4s, v0.4s \n"
"add v2.4s, v2.4s, v0.4s \n"
"subs %w2, %w2, #8 \n" // 8 processed per loop
"b.gt 1b \n"
"b.gt 1b \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(dst_width), // %2
@ -647,6 +668,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"rshrn v0.8b, v6.8h, #8 \n"
"rshrn2 v0.16b, v7.8h, #8 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 1b \n"
"b 99f \n"
@ -658,6 +681,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 25b \n"
"b 99f \n"
@ -668,6 +693,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"subs %w3, %w3, #16 \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 50b \n"
"b 99f \n"
@ -679,6 +706,8 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"urhadd v0.16b, v0.16b, v1.16b \n"
"urhadd v0.16b, v0.16b, v1.16b \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%2, 448] \n"
"b.gt 75b \n"
"b 99f \n"
@ -687,6 +716,7 @@ void ScaleFilterRows_NEON(uint8_t* dst_ptr,
"ld1 {v0.16b}, [%1], #16 \n"
"subs %w3, %w3, #16 \n"
"st1 {v0.16b}, [%0], #16 \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 100b \n"
"99: \n"
@ -713,6 +743,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
"subs %w2, %w2, #8 \n" // 8 processed per loop
"mov v2.16b, v3.16b \n"
"st2 {v1.4s,v2.4s}, [%1], #32 \n" // store 8 odd pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst), // %1
@ -736,6 +767,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
"urhadd v0.16b, v0.16b, v1.16b \n" // rounding half add
"urhadd v1.16b, v2.16b, v3.16b \n"
"st2 {v0.4s,v1.4s}, [%1], #32 \n" // store 8 pixels
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -769,6 +801,8 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
"rshrn v2.8b, v2.8h, #2 \n"
"rshrn v3.8b, v3.8h, #2 \n"
"st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -794,6 +828,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"ld1 {v0.s}[3], [%0], %3 \n"
"subs %w2, %w2, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%1], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
@ -838,6 +873,8 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
"rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
"subs %w3, %w3, #4 \n" // 4 pixels per loop.
"st1 {v0.16b}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_stride), // %1
@ -878,6 +915,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
// clang-format on
"st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
"subs %w2, %w2, #8 \n" // 8 processed per loop
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
@ -949,7 +987,8 @@ void ScaleARGBFilterCols_NEON(uint8_t* dst_argb,
"st1 {v0.4s}, [%0], #16 \n" // store pixels
"add v5.4s, v5.4s, v6.4s \n"
"subs %w2, %w2, #4 \n" // 4 processed per loop
"b.gt 1b \n"
"prfm pldl1keep, [%1, 448] \n" // prefetch 7 lines ahead
"b.gt 1b \n"
: "+r"(dst_argb), // %0
"+r"(src_argb), // %1
"+r"(dst_width), // %2
@ -984,6 +1023,8 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
"rshrn v0.4h, v0.4s, #2 \n" // round and pack
"rshrn2 v0.8h, v1.4s, #2 \n"
"st1 {v0.8h}, [%2], #16 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1
@ -1032,6 +1073,8 @@ void ScaleRowUp2_16_NEON(const uint16_t* src_ptr,
"uqrshrn v17.4h, v18.4s, #4 \n"
"uqrshrn2 v17.8h, v4.4s, #4 \n"
"st2 {v16.8h-v17.8h}, [%2], #32 \n"
"prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead
"prfm pldl1keep, [%1, 448] \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src_stride), // %1

View File

@ -804,6 +804,23 @@ TEST_F(LibYUVPlanarTest, TestARGBMirror) {
}
}
TEST_F(LibYUVPlanarTest, TestMirrorPlane) {
SIMD_ALIGNED(uint8_t orig_pixels[1280]);
SIMD_ALIGNED(uint8_t dst_pixels[1280]);
for (int i = 0; i < 1280; ++i) {
orig_pixels[i] = i;
}
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
for (int i = 0; i < 1280; ++i) {
EXPECT_EQ(i & 255, dst_pixels[1280 - 1 - i]);
}
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
MirrorPlane(&orig_pixels[0], 0, &dst_pixels[0], 0, 1280, 1);
}
}
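
The negative-height path inside MirrorPlane (the height < 0 branch shown earlier in this change) turns the horizontal mirror into a full 180-degree rotation of the plane. A sketch reusing the function under test:

#include "libyuv/planar_functions.h"  // assumed location of MirrorPlane

// 180-degree rotation of a plane: mirror each row, and pass a negative
// height so MirrorPlane also inverts the image vertically.
void Rotate180Plane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y,
                    int dst_stride_y, int width, int height) {
  libyuv::MirrorPlane(src_y, src_stride_y, dst_y, dst_stride_y, width,
                      -height);
}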
TEST_F(LibYUVPlanarTest, TestShade) {
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
SIMD_ALIGNED(uint8_t shade_pixels[1280][4]);
@ -3315,8 +3332,8 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
}
#else
GaussCol_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4], &dst_pixels_opt[0],
1280);
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@ -3369,36 +3386,24 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_F32_Opt) {
for (int i = 0; i < 1280 * 5; ++i) {
orig_pixels[i] = static_cast<float>(i);
}
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_c[0], 1280);
for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
int has_neon = TestCpuFlag(kCpuHasNEON);
if (has_neon) {
GaussCol_F32_NEON(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
GaussCol_F32_NEON(&orig_pixels[0], &orig_pixels[1280],
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
} else {
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280],
&orig_pixels[1280 * 2], &orig_pixels[1280 * 3],
&orig_pixels[1280 * 4], &dst_pixels_opt[0], 1280);
}
#else
GaussCol_F32_C(&orig_pixels[0],
&orig_pixels[1280],
&orig_pixels[1280 * 2],
&orig_pixels[1280 * 3],
&orig_pixels[1280 * 4],
GaussCol_F32_C(&orig_pixels[0], &orig_pixels[1280], &orig_pixels[1280 * 2],
&orig_pixels[1280 * 3], &orig_pixels[1280 * 4],
&dst_pixels_opt[0], 1280);
#endif
}
@ -3455,18 +3460,18 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
MaskCpuFlags(disable_cpu_flags_);
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_c), benchmark_width_,
benchmark_width_, benchmark_height_);
(float*)(dst_pixels_c), benchmark_width_, benchmark_width_,
benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
GaussPlane_F32((const float*)(orig_pixels), benchmark_width_,
(float*)(dst_pixels_opt), benchmark_width_,
benchmark_width_, benchmark_height_);
(float*)(dst_pixels_opt), benchmark_width_, benchmark_width_,
benchmark_height_);
}
for (int i = 0; i < benchmark_width_ * benchmark_height_ ; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c)) [i],
((float*)(dst_pixels_opt))[i], 1.f) << i;
for (int i = 0; i < benchmark_width_ * benchmark_height_; ++i) {
EXPECT_NEAR(((float*)(dst_pixels_c))[i], ((float*)(dst_pixels_opt))[i], 1.f)
<< i;
}
free_aligned_buffer_page_end(dst_pixels_c);

View File

@ -183,4 +183,46 @@ TEST_F(LibYUVRotateTest, DISABLED_RotatePlane270_Odd) {
benchmark_cpu_info_);
}
TEST_F(LibYUVRotateTest, RotatePlane90_TestStride) {
int argb_plane_size = benchmark_width_ * 4 * abs(benchmark_height_);
align_buffer_page_end(src_argb, argb_plane_size);
align_buffer_page_end(dst_argb, argb_plane_size);
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
benchmark_width_ * 4, benchmark_width_,
benchmark_height_, kRotate0));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
benchmark_height_, kRotate0));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
benchmark_width_ * 4, benchmark_width_,
benchmark_height_, kRotate180));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
benchmark_width_ * 4 - 1, benchmark_width_ - 1,
benchmark_height_, kRotate180));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_,
benchmark_height_, kRotate90));
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_ - 1,
benchmark_height_, kRotate90));
EXPECT_EQ(0, ARGBRotate(src_argb, benchmark_width_ * 4, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_,
benchmark_height_, kRotate270));
EXPECT_EQ(-1, ARGBRotate(src_argb, benchmark_width_ * 4 - 1, dst_argb,
abs(benchmark_height_) * 4, benchmark_width_ - 1,
benchmark_height_, kRotate270));
free_aligned_buffer_page_end(dst_argb);
free_aligned_buffer_page_end(src_argb);
}
} // namespace libyuv