From 6546096269c3f452effde05cb7f449428fa658d0 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 7 Jun 2016 10:44:28 -0700 Subject: [PATCH] ARGBExtractAlpha 16 pixels at a time for ARM arm64 8 TestARGBExtractAlpha (10019 ms) <-original 64 bit code arm64 8 x2 TestARGBExtractAlpha (7639 ms) arm64 16 TestARGBExtractAlpha (7369 ms) <- new 64 bit code thumb32 8 TestARGBExtractAlpha (9505 ms) <- original 32 bit code thumb32 8 x2 TestARGBExtractAlpha (7400 ms) thumb32 8 x2i TestARGBExtractAlpha (7266 ms) <- new 32 bit code arm32 8 TestARGBExtractAlpha (10002 ms) BUG=libyuv:572 TESTED=local test on nexus 9 R=harryjin@google.com, wangcheng@google.com Review URL: https://codereview.chromium.org/2035573002 . --- README.chromium | 2 +- include/libyuv/version.h | 2 +- libyuv.gyp | 2 ++ source/planar_functions.cc | 4 ++-- source/row_any.cc | 2 +- source/row_neon.cc | 9 +++++---- source/row_neon64.cc | 9 ++++----- 7 files changed, 16 insertions(+), 14 deletions(-) diff --git a/README.chromium b/README.chromium index 9e3a46c71..c460a7b3f 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1593 +Version: 1594 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1fc63dcfc..72a751740 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1593 +#define LIBYUV_VERSION 1594 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index ed7c40c7c..db4b54904 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -61,6 +61,7 @@ '-mfpu=vfp', '-mfpu=vfpv3', '-mfpu=vfpv3-d16', + # '-mthumb', # arm32 not thumb ], 'conditions': [ # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug. @@ -74,6 +75,7 @@ ['target_arch != "arm64"', { 'cflags': [ '-mfpu=neon', + # '-marm', # arm32 not thumb ], }], ], diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b1b1f2c83..237ab6831 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -2404,8 +2404,8 @@ int ARGBExtractAlpha(const uint8* src_argb, int src_stride, #endif #if defined(HAS_ARGBEXTRACTALPHAROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { - ARGBExtractAlphaRow = IS_ALIGNED(width, 8) ? ARGBExtractAlphaRow_NEON - : ARGBExtractAlphaRow_Any_NEON; + ARGBExtractAlphaRow = IS_ALIGNED(width, 16) ? ARGBExtractAlphaRow_NEON + : ARGBExtractAlphaRow_Any_NEON; } #endif diff --git a/source/row_any.cc b/source/row_any.cc index 94ae0edd7..494164fd0 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -470,7 +470,7 @@ ANY11(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, 0, 4, 4, 7) ANY11(ARGBExtractAlphaRow_Any_SSE2, ARGBExtractAlphaRow_SSE2, 0, 4, 1, 7) #endif #ifdef HAS_ARGBEXTRACTALPHAROW_NEON -ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 7) +ANY11(ARGBExtractAlphaRow_Any_NEON, ARGBExtractAlphaRow_NEON, 0, 4, 1, 15) #endif #undef ANY11 diff --git a/source/row_neon.cc b/source/row_neon.cc index 7574cee85..9e60237ec 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1302,16 +1302,17 @@ void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { asm volatile ( "1: \n" MEMACCESS(0) - "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels - "subs %2, %2, #8 \n" // 8 processed per loop + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels + "subs %2, %2, #16 \n" // 16 processed per loop MEMACCESS(1) - "vst1.8 {d3}, [%1]! \n" // store 8 A's. + "vst1.8 {q3}, [%1]! \n" // store 16 A's. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1 "+r"(width) // %2 : - : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index e5f2dc8f3..80e1515b2 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -450,7 +450,6 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { - int64 width64 = (int64)(width); asm volatile ( YUVTORGB_SETUP "movi v23.8b, #255 \n" @@ -463,7 +462,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 - "+r"(width64) // %2 + "+r"(width) // %2 : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), [kUVToG]"r"(&kYuvI601Constants.kUVToG), [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), @@ -1404,10 +1403,10 @@ void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { asm volatile ( "1: \n" MEMACCESS(0) - "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load row 8 pixels - "subs %w2, %w2, #8 \n" // 8 processed per loop + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels + "subs %w2, %w2, #16 \n" // 16 processed per loop MEMACCESS(1) - "st1 {v3.8b}, [%1], #8 \n" // store 8 A's. + "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_a), // %1