[AArch64] Avoid LD2 in YUY2ToARGBRow_NEON and UYVYToARGBRow_NEON

In this case we have an LD2 instruction followed by a pair of permutes (ZIP1 and TBL). On some micro-architectures LD2 involves use of the vector pipelines, so in these cases it is preferable to do an LD1 and then a different pair of permutes (TRN + TBL) instead to avoid the extra vector pipeline usage.

Reduction in runtime on selected kernels (no observed performance delta on Cortex-A55):

Kernel             | Cortex-A76 | Cortex-X2
UYVYToARGBRow_NEON |      -2.6% |     -8.8%
YUY2ToARGBRow_NEON |      -6.2% |     -4.9%

Bug: libyuv:976
Change-Id: I7ca45e0c7bf7cb50cc5ab37c6a01215d9689039a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/5366652
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
2026-01-01 03:12:16 +08:00 · 2024-03-12 22:08:57 +00:00 · 2024-03-12 22:08:57 +00:00 · 8d0d885c2f
commit 8d0d885c2f
parent 188e4e3afb
1 changed files with 27 additions and 21 deletions
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -55,8 +55,12 @@ extern "C" {

 static const uvec8 kNV12Table = {0, 0, 2, 2, 4, 4, 6, 6,
                                 1, 1, 3, 3, 5, 5, 7, 7};
+static const uvec8 kNV12InterleavedTable = {0, 0, 4, 4, 8,  8,  12, 12,
+                                            2, 2, 6, 6, 10, 10, 14, 14};
 static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
                                 0, 0, 2, 2, 4, 4, 6, 6};
+static const uvec8 kNV21InterleavedTable = {1, 1, 5, 5, 9,  9,  13, 13,
+                                            3, 3, 7, 7, 11, 11, 15, 15};

 // Read 8 Y and 4 UV from NV12 or NV21
 #define READNV12                                 \
@ -68,17 +72,17 @@ static const uvec8 kNV21Table = {1, 1, 3, 3, 5, 5, 7, 7,
  "prfm       pldl1keep, [%[src_uv], 448]    \n"

 // Read 8 YUY2
-#define READYUY2                                     \
-  "ld2        {v0.8b, v1.8b}, [%[src_yuy2]], #16 \n" \
-  "zip1       v0.16b, v0.16b, v0.16b         \n"     \
-  "prfm       pldl1keep, [%[src_yuy2], 448]  \n"     \
-  "tbl        v1.16b, {v1.16b}, v2.16b       \n"
+#define READYUY2                                 \
+  "ld1        {v3.16b}, [%[src_yuy2]], #16   \n" \
+  "trn1       v0.16b, v3.16b, v3.16b         \n" \
+  "prfm       pldl1keep, [%[src_yuy2], 448]  \n" \
+  "tbl        v1.16b, {v3.16b}, v2.16b       \n"

 // Read 8 UYVY
-#define READUYVY                                     \
-  "ld2        {v3.8b, v4.8b}, [%[src_uyvy]], #16 \n" \
-  "zip1       v0.16b, v4.16b, v4.16b         \n"     \
-  "prfm       pldl1keep, [%[src_uyvy], 448]  \n"     \
+#define READUYVY                                 \
+  "ld1        {v3.16b}, [%[src_uyvy]], #16   \n" \
+  "trn2       v0.16b, v3.16b, v3.16b         \n" \
+  "prfm       pldl1keep, [%[src_uyvy], 448]  \n" \
  "tbl        v1.16b, {v3.16b}, v2.16b       \n"

 // UB VR UG VG
@ -569,18 +573,19 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
-      "movi        v19.8b, #255                  \n"
-      "ldr         q2, [%[kNV12Table]]           \n"
-      "1:                                        \n" READYUY2 YUVTORGB RGBTORGB8
-      "subs        %w[width], %w[width], #8      \n"
+      "movi        v19.8b, #255                   \n"
+      "ldr         q2, [%[kNV21InterleavedTable]] \n"
+      "1:                                         \n" READYUY2 YUVTORGB
+          RGBTORGB8
+      "subs        %w[width], %w[width], #8       \n"
      "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
-      "b.gt        1b                            \n"
+      "b.gt        1b                             \n"
      : [src_yuy2] "+r"(src_yuy2),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
-        [kNV12Table] "r"(&kNV12Table)
+        [kNV21InterleavedTable] "r"(&kNV21InterleavedTable)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
 }

@ -590,18 +595,19 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
                        int width) {
  asm volatile(
      YUVTORGB_SETUP
-      "movi        v19.8b, #255                  \n"
-      "ldr         q2, [%[kNV12Table]]           \n"
-      "1:                                        \n" READUYVY YUVTORGB RGBTORGB8
-      "subs        %w[width], %w[width], #8      \n"
+      "movi        v19.8b, #255                   \n"
+      "ldr         q2, [%[kNV12InterleavedTable]] \n"
+      "1:                                         \n" READUYVY YUVTORGB
+          RGBTORGB8
+      "subs        %w[width], %w[width], #8       \n"
      "st4         {v16.8b,v17.8b,v18.8b,v19.8b}, [%[dst_argb]], #32 \n"
-      "b.gt        1b                            \n"
+      "b.gt        1b                             \n"
      : [src_uyvy] "+r"(src_uyvy),                          // %[src_yuy2]
        [dst_argb] "+r"(dst_argb),                          // %[dst_argb]
        [width] "+r"(width)                                 // %[width]
      : [kUVCoeff] "r"(&yuvconstants->kUVCoeff),            // %[kUVCoeff]
        [kRGBCoeffBias] "r"(&yuvconstants->kRGBCoeffBias),  // %[kRGBCoeffBias]
-        [kNV12Table] "r"(&kNV12Table)
+        [kNV12InterleavedTable] "r"(&kNV12InterleavedTable)
      : "cc", "memory", YUVTORGB_REGS, "v2", "v19");
 }