GaussRow_NEON: take int (uint32) input and produce short (uint16) output

[ RUN ] LibYUVPlanarTest.TestGaussRow_Opt [ OK ] LibYUVPlanarTest.TestGaussRow_Opt (601 ms) [ RUN ] LibYUVPlanarTest.TestGaussCol_Opt [ OK ] LibYUVPlanarTest.TestGaussCol_Opt (522 ms) TBR=kjellander@chromium.org BUG=libyuv:719 TEST=LibYUVPlanarTest.TestGaussRow_Opt Change-Id: I1242b98672538e889f3ab48f215d6dabc7144ea7 Reviewed-on: https://chromium-review.googlesource.com/627478 Reviewed-by: Cheng Wang <wangcheng@google.com> Reviewed-by: Frank Barchard <fbarchard@google.com>
2026-02-14 14:19:52 +08:00 · 2017-08-23 16:01:55 -07:00 · 2017-08-23 16:01:55 -07:00 · ad2409443c
commit ad2409443c
parent 1cc539f7d6
3 changed files with 175 additions and 53 deletions
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -2672,6 +2672,15 @@ void ScaleSamples_C(const float* src, float* dst, float scale, int width) {
  }
 }
 void GaussRow_C(const uint32* src, uint16* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    *dst++ =
        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
    ++src;
  }
 }
 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
 void GaussCol_C(const uint16* src0,
                const uint16* src1,
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -2692,13 +2692,6 @@ void ScaleSamples_NEON(const float* src, float* dst, float scale, int width) {
      : "cc", "memory", "v1", "v2");
 }
 // Four rows of 8 coefficients. Each row holds the 1, 4, 6, 4, 1 kernel shifted
 // one position further right than the previous row, with zeros elsewhere.
 static vec16 kGauseCoefficients[4] = {
    {1, 4, 6, 4, 1, 0, 0, 0},
    {0, 1, 4, 6, 4, 1, 0, 0},
    {0, 0, 1, 4, 6, 4, 1, 0},
    {0, 0, 0, 1, 4, 6, 4, 1},
 };
 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
 void GaussCol_NEON(const uint16* src0,
                   const uint16* src1,
@ -2719,15 +2712,15 @@ void GaussCol_NEON(const uint16* src0,
      "ld1        {v5.8h}, [%4], #16             \n"
      "subs       %w6, %w6, #8                   \n"  // 8 processed per loop
-      "uaddl       v0.4s, v1.4h, v5.4h            \n"  // * 1
+      "uaddl       v0.4s, v1.4h, v5.4h           \n"  // * 1
-      "uaddl2      v1.4s, v1.8h, v5.8h            \n"  // * 1
+      "uaddl2      v1.4s, v1.8h, v5.8h           \n"  // * 1
-      "umlal       v0.4s, v2.4h, v6.4h            \n"  // * 4
+      "umlal       v0.4s, v2.4h, v6.4h           \n"  // * 4
-      "umlal2      v1.4s, v2.8h, v6.8h            \n"  // * 4
+      "umlal2      v1.4s, v2.8h, v6.8h           \n"  // * 4
-      "umlal       v0.4s, v3.4h, v7.4h            \n"  // * 6
+      "umlal       v0.4s, v3.4h, v7.4h           \n"  // * 6
-      "umlal2      v1.4s, v3.8h, v7.8h            \n"  // * 6
+      "umlal2      v1.4s, v3.8h, v7.8h           \n"  // * 6
-      "umlal       v0.4s, v4.4h, v6.4h            \n"  // * 4
+      "umlal       v0.4s, v4.4h, v6.4h           \n"  // * 4
-      "umlal2      v1.4s, v4.8h, v6.8h            \n"  // * 4
+      "umlal2      v1.4s, v4.8h, v6.8h           \n"  // * 4
      "st1        {v0.4s,v1.4s}, [%5], #32       \n"  // store 8 samples
      "b.gt       1b                             \n"
@ -2743,41 +2736,164 @@ void GaussCol_NEON(const uint16* src0,
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
 }
 #if 0
  a8:	ad7f8d82 	ldp	q2, q3, [x12,#-16]
  ac:	3cdf8186 	ldur	q6, [x12,#-8]
  b0:	3cdf4184 	ldur	q4, [x12,#-12]
  b4:	3cc04185 	ldur	q5, [x12,#4]
  b8:	3cc08187 	ldur	q7, [x12,#8]
  bc:	3cdfc190 	ldur	q16, [x12,#-4]
  c0:	3cc0c191 	ldur	q17, [x12,#12]
  c4:	3dc00592 	ldr	q18, [x12,#16]
  c8:	4ea094c2 	mla	v2.4s, v6.4s, v0.4s    #6
  cc:	4ea48604 	add	v4.4s, v16.4s, v4.4s
  d0:	4ea58625 	add	v5.4s, v17.4s, v5.4s
  d4:	4ea38442 	add	v2.4s, v2.4s, v3.4s
  d8:	4ea094e3 	mla	v3.4s, v7.4s, v0.4s    #6
  dc:	4f225484 	shl	v4.4s, v4.4s, #2
  e0:	4f2254a5 	shl	v5.4s, v5.4s, #2
  e4:	4eb28463 	add	v3.4s, v3.4s, v18.4s
  e8:	4ea48442 	add	v2.4s, v2.4s, v4.4s
  ec:	4ea58463 	add	v3.4s, v3.4s, v5.4s
  f0:	4ea18442 	add	v2.4s, v2.4s, v1.4s    #128
  f4:	4ea18463 	add	v3.4s, v3.4s, v1.4s    #128
  f8:	0f188442 	shrn	v2.4h, v2.4s, #8
  fc:	0f188463 	shrn	v3.4h, v3.4s, #8
 100:	f10021ad 	subs	x13, x13, #0x8
 104:	6d3f8d62 	stp	d2, d3, [x11,#-8]
 108:	9100416b 	add	x11, x11, #0x10
 10c:	9100818c 	add	x12, x12, #0x20
 110:	54fffcc1 	b.ne	a8 <GaussRow_C+0xa8>
 #endif
 // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row.
-void GaussRow_NEON(const uint16* src0, uint16* dst, int width) {
+void GaussRow_NEON3(const uint32* src, uint16* dst, int width) {
  asm volatile(
-      "ld1       {v20.8h,v21.8h,v22.8h,v23.8h}, [%3]  \n"
+      "movi       v0.4s, #6                      \n"  // constant 6
      "add	%0, %0, #0x10                \n"                      
      "add	%1, %1, #0x8                  \n" 
      "1:                                        \n"
      "ld1        {v0.8h}, [%0], %4              \n"  // load 8 source samples
      "subs       %w2, %w2, #4                   \n"  // 4 processed per loop
-      "umull      v1.4s, v0.4h, v20.4h           \n"  // first pixel
+      "ldp	q2, q3, [%0,#-16]                 \n"            
-      "umlal2     v1.4s, v0.8h, v20.8h           \n"
+      "ldur	q6, [%0,#-8]                    \n" 
-      "addv       s1, v1.4s                      \n"
+      "ldur	q4, [%0,#-12]                    \n" 
-
+      "ldur	q5, [%0,#4]                    \n" 
-      "umull      v2.4s, v0.4h, v21.4h           \n"  // second pixel
+      "ldur	q7, [%0,#8]                    \n" 
-      "umlal2     v2.4s, v0.8h, v21.8h           \n"
+      "ldur	q16, [%0,#-4]                    \n" 
-      "addv       s2, v2.4s                      \n"
+      "ldur	q17, [%0,#12]                    \n" 
-
+      "ldr	q18, [%0,#16]                    \n" 
-      "umull      v3.4s, v0.4h, v22.4h           \n"  // third pixel
+      "mla	v2.4s, v6.4s, v0.4s                 \n" 
-      "umlal2     v3.4s, v0.8h, v22.8h           \n"
+      "add	v4.4s, v16.4s, v4.4s             \n" 
-      "addv       s3, v3.4s                      \n"
+      "add	v5.4s, v17.4s, v5.4s             \n" 
-
+      "add	v2.4s, v2.4s, v3.4s             \n" 
-      "umull      v4.4s, v0.4h, v23.4h           \n"  // forth pixel
+      "mla	v3.4s, v7.4s, v0.4s                 \n" 
-      "umlal2     v4.4s, v0.8h, v23.8h           \n"
+      "shl	v4.4s, v4.4s, #2             \n" 
-      "addv       s4, v4.4s                      \n"
+      "shl	v5.4s, v5.4s, #2             \n" 
-
+      "add	v3.4s, v3.4s, v18.4s             \n" 
-      "st4       {v1.s,v2.s,v3.s,v4.s}[0], [%1], #16  \n"  // store 4 samples
+      "add	v2.4s, v2.4s, v4.4s             \n" 
      "add	v3.4s, v3.4s, v5.4s             \n" 
      "add	v2.4s, v2.4s, v1.4s                 \n" 
      "add	v3.4s, v3.4s, v1.4s                 \n" 
      "shrn	v2.4h, v2.4s, #8             \n" 
      "shrn	v3.4h, v3.4s, #8             \n" 
      "subs	%w2, %w2, #0x8                   \n" 
      "stp	d2, d3, [%1,#-8]                  \n" 
      "add	%1, %1, #0x10                  \n" 
      "add	%0, %0, #0x20                \n"                      
      "b.gt       1b                             \n"
-      : "+r"(src0),                   // %0
+      : "+r"(src),                    // %0
        "+r"(dst),                    // %1
        "+r"(width)                   // %2
-      : "r"(&kGauseCoefficients[0]),  // %3
+      :
-        "r"(8LL)                      // %4
+      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
-      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v20", "v21", "v22",
+        "v16", "v17", "v18" );
-        "v23");
+}
 void GaussRow_NEON2(const uint32* src, uint16* dst, int width) {
  int i;
  for (i = 0; i < width; ++i) {
    *dst++ =
        (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4] + 128) >> 8;
    ++src;
  }
 }
 // Horizontal 5-tap filter with 1, 4, 6, 4, 1 coefficients over one row of
 // 32-bit samples, producing 16-bit outputs:
 //   dst[i] = (src[i] + 4 * src[i + 1] + 6 * src[i + 2] + 4 * src[i + 3] +
 //             src[i + 4]) rounded and shifted right by 8.
 // The narrowing instruction (uqrshrn) saturates to the unsigned 16-bit range.
 // 8 outputs are produced per loop iteration and the loop repeats while the
 // remaining width is positive, so each iteration reads src[0..11] relative to
 // its start. NOTE(review): callers presumably pass a width that is a multiple
 // of 8 - confirm.
 void GaussRow_NEON(const uint32* src, uint16* dst, int width) {
  // Separate pointers for the taps at offsets 1, 2 and 3 so that each can be
  // loaded with its own post-incrementing ld1.
  const uint32* src1 = src + 1;
  const uint32* src2 = src + 2;
  const uint32* src3 = src + 3;
  asm volatile(
      "movi       v6.4s, #4                      \n"  // constant 4
      "movi       v7.4s, #6                      \n"  // constant 6
      "1:                                        \n"
      "ld1        {v1.4s,v2.4s,v3.4s}, [%0], %6  \n"  // load 12 source samples, advance 8
      "ld1        {v4.4s,v5.4s}, [%1], #32       \n"  // 8 samples at offset 1
      "ld1        {v16.4s,v17.4s}, [%2], #32     \n"  // 8 samples at offset 2
      "ld1        {v18.4s,v19.4s}, [%3], #32     \n"  // 8 samples at offset 3
      "subs       %w5, %w5, #8                   \n"  // 8 processed per loop
      "add        v0.4s, v1.4s, v2.4s            \n"  // * 1: src[i] + src[i + 4]
      "add        v1.4s, v2.4s, v3.4s            \n"  // * 1: next 4 outputs
      "add        v2.4s, v4.4s, v18.4s           \n"  // src[i + 1] + src[i + 3] for * 4
      "add        v3.4s, v5.4s, v19.4s           \n"
      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
      "mla        v1.4s, v3.4s, v6.4s            \n"  // * 4
      "mla        v0.4s, v16.4s, v7.4s           \n"  // * 6
      "mla        v1.4s, v17.4s, v7.4s           \n"  // * 6
      "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack
      "uqrshrn2   v0.8h, v1.4s, #8               \n"
      "st1        {v0.8h}, [%4], #16             \n"  // store 8 samples
      "b.gt       1b                             \n"
      : "+r"(src),   // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(width)  // %5
      : "r"(32LL)    // %6: src post-increment in bytes (8 samples)
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
        "v16", "v17", "v18", "v19" );
 }
 // Horizontal 5-tap filter with 1, 4, 6, 4, 1 coefficients over one row of
 // 32-bit samples, producing 16-bit outputs:
 //   dst[i] = (src[i] + 4 * src[i + 1] + 6 * src[i + 2] + 4 * src[i + 3] +
 //             src[i + 4]) rounded and shifted right by 8.
 // The narrowing instruction (uqrshrn) saturates to the unsigned 16-bit range.
 // 4 outputs are produced per loop iteration and the loop repeats while the
 // remaining width is positive, so each iteration reads src[0..7] relative to
 // its start. NOTE(review): callers presumably pass a width that is a multiple
 // of 4 - confirm.
 void GaussRow_NEON4(const uint32* src, uint16* dst, int width) {
  // Separate pointers for the taps at offsets 1, 2 and 3 so that each can be
  // loaded with its own post-incrementing ld1.
  const uint32* src1 = src + 1;
  const uint32* src2 = src + 2;
  const uint32* src3 = src + 3;
  asm volatile(
      "movi       v6.4s, #4                      \n"  // constant 4
      "movi       v7.4s, #6                      \n"  // constant 6
      "1:                                        \n"
      "ld1        {v0.4s,v1.4s}, [%0], %6        \n"  // load 8 source samples, advance 4
      "ld1        {v2.4s}, [%1], #16             \n"  // 4 samples at offset 1
      "ld1        {v3.4s}, [%2], #16             \n"  // 4 samples at offset 2
      "ld1        {v4.4s}, [%3], #16             \n"  // 4 samples at offset 3
      "subs       %w5, %w5, #4                   \n"  // 4 processed per loop
      "mla        v0.4s, v2.4s, v6.4s            \n"  // * 4
      "mla        v0.4s, v3.4s, v7.4s            \n"  // * 6
      "mla        v0.4s, v4.4s, v6.4s            \n"  // * 4
      "add        v0.4s, v0.4s, v1.4s            \n"  // * 1 (samples at offset 4)
      "uqrshrn    v0.4h, v0.4s, #8               \n"  // round and pack
      "st1        {v0.4h}, [%4], #8             \n"  // store 4 samples
      "b.gt       1b                             \n"
      : "+r"(src),   // %0
        "+r"(src1),  // %1
        "+r"(src2),  // %2
        "+r"(src3),  // %3
        "+r"(dst),   // %4
        "+r"(width)  // %5
      : "r"(16LL)    // %6: src post-increment in bytes (4 samples)
      // Each clobbered register is listed once (v0 and v1 were duplicated).
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v7");
 }
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@ -2725,23 +2725,22 @@ TEST_F(LibYUVPlanarTest, TestScaleSamples_Opt) {
  EXPECT_EQ(0, diff);
 }
-#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+extern "C" void GaussRow_NEON(const uint32* src, uint16* dst, int width);
-
+extern "C" void GaussRow_C(const uint32* src, uint16* dst, int width);
 extern "C" void GaussRow_NEON(const uint16* src0, uint32* dst, int width);
 TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
-  SIMD_ALIGNED(uint16 orig_pixels[1280 + 4]);
+  SIMD_ALIGNED(uint32 orig_pixels[1280 + 4]);
-  SIMD_ALIGNED(uint32 dst_pixels_c[1280]);
+  SIMD_ALIGNED(uint16 dst_pixels_c[1280]);
-  SIMD_ALIGNED(uint32 dst_pixels_opt[1280]);
+  SIMD_ALIGNED(uint16 dst_pixels_opt[1280]);
  memset(orig_pixels, 0, sizeof(orig_pixels));
  memset(dst_pixels_c, 1, sizeof(dst_pixels_c));
  memset(dst_pixels_opt, 2, sizeof(dst_pixels_opt));
  for (int i = 0; i < 1280 + 4; ++i) {
-    orig_pixels[i] = i;
+    orig_pixels[i] = i * 256;
  }
-  GaussRow_NEON(&orig_pixels[0], &dst_pixels_c[0], 1280);
+  GaussRow_C(&orig_pixels[0], &dst_pixels_c[0], 1280);
  MaskCpuFlags(benchmark_cpu_info_);
  for (int i = 0; i < benchmark_pixels_div1280_; ++i) {
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
@ -2749,10 +2748,10 @@ TEST_F(LibYUVPlanarTest, TestGaussRow_Opt) {
    if (has_neon) {
      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
    } else {
-      GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+      GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
    }
 #else
-    GaussRow_NEON(&orig_pixels[0], &dst_pixels_opt[0], 1280);
+    GaussRow_C(&orig_pixels[0], &dst_pixels_opt[0], 1280);
 #endif
  }
@ -2824,6 +2823,4 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
  EXPECT_EQ(dst_pixels_c[1279], 61424);
 }
 #endif  // aarch64
 }  // namespace libyuv