diff --git a/README.chromium b/README.chromium
index cb31b649d..f803c4cd3 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1084
+Version: 1086
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 4a2974749..7e52c18af 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -58,6 +58,13 @@ extern "C" {
 #if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
 #define LIBYUV_DISABLE_NEON
 #endif
+// clang >= 3.5.0 required for Arm64.
+#if defined(__clang__) && defined(__aarch64__) && !defined(LIBYUV_DISABLE_NEON)
+#if (__clang_major__ < 3) || (__clang_major__ == 3 && (__clang_minor__ < 5))
+#define LIBYUV_DISABLE_NEON
+#endif  // clang >= 3.5
+#endif  // __clang__
+
 // The following are available on all x86 platforms:
 #if !defined(LIBYUV_DISABLE_X86) && \
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 5ed87e8df..c44212381 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1084
+#define LIBYUV_VERSION 1086
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/libyuv.gyp b/libyuv.gyp
index 52e7554fb..3a0b84f82 100644
--- a/libyuv.gyp
+++ b/libyuv.gyp
@@ -25,7 +25,7 @@
     'conditions': [
       ['(target_arch == "armv7" or target_arch == "armv7s" or \
         (target_arch == "arm" and arm_version >= 7) or target_arch == "arm64")\
-        and target_subarch != 64 and (arm_neon == 1 or arm_neon_optional == 1)',
+        and (arm_neon == 1 or arm_neon_optional == 1)',
       {
         'build_neon': 1,
       }],
@@ -47,11 +47,6 @@
           '-mfpu=vfpv3-d16',
         ],
         'conditions': [
-          ['target_arch != "arm64"', {
-            'cflags': [
-              '-mfpu=neon',
-            ],
-          }],
           # Disable LTO in libyuv_neon target due to gcc 4.9 compiler bug.
           ['use_lto == 1', {
             'cflags!': [
@@ -60,6 +55,9 @@
             ],
           }],
         ],
+        'cflags': [
+          '-mfpu=neon',
+        ],
         'include_dirs': [
           'include',
           '.',
@@ -93,11 +91,6 @@
       # Allows libyuv.a redistributable library without external dependencies.
       'standalone_static_library': 1,
       'conditions': [
-        ['OS == "ios" and target_subarch == 64', {
-          'defines': [
-            'LIBYUV_DISABLE_NEON'
-          ],
-        }],
         ['OS != "ios" and libyuv_disable_jpeg != 1', {
           'defines': [
             'HAVE_JPEG'
@@ -126,15 +119,6 @@
           'dependencies': [
             'libyuv_neon',
           ],
-          'conditions': [
-            # TODO LIBYUV_NEON is temporary disabled. When all arm64 port has
-            # been done, enable it.
-            ['target_arch !="arm64"', {
-              'defines': [
-                'LIBYUV_NEON',
-              ]
-            }],
-          ],
         }],
         # MemorySanitizer does not support assembly code yet.
         # http://crbug.com/344505
@@ -151,6 +135,7 @@
         # 'LIBYUV_DISABLE_MIPS',
         # Enable the following macro to build libyuv as a shared library (dll).
         # 'LIBYUV_USING_SHARED_LIBRARY',
+        # TODO(fbarchard): Make these into gyp defines.
       ],
       'include_dirs': [
         'include',
diff --git a/source/cpu_id.cc b/source/cpu_id.cc
index deb4c4465..8f8a403ee 100644
--- a/source/cpu_id.cc
+++ b/source/cpu_id.cc
@@ -15,7 +15,8 @@
 #endif
 #if !defined(__pnacl__) && !defined(__CLR_VER) && \
     !defined(__native_client__) && \
-    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219)
+    defined(_MSC_VER) && (_MSC_FULL_VER >= 160040219) && \
+    (defined(_M_IX86) || defined(_M_X64))
 #include <immintrin.h>  // For _xgetbv()
 #endif
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index 6fe60b8c4..09775c457 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1297,8 +1297,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
 #ifdef HAS_YUY2TOUVROW_NEON
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
   asm volatile (
-    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_yuy2
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
@@ -1314,7 +1314,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
     "st1        {v3.8b}, [%3], #8              \n"  // store 8 V.
     "b.gt       1b                             \n"
   : "+r"(src_yuy2),     // %0
-    "+r"(stride_yuy2),  // %1
+    "+r"(src_yuy2b),    // %1
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(pix)           // %4
@@ -1327,8 +1327,8 @@
 #ifdef HAS_UYVYTOUVROW_NEON
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
+  const uint8* src_uyvyb = src_uyvy + stride_uyvy;
   asm volatile (
-    "add        %x1, %x0, %w1, sxtw            \n"  // stride + src_uyvy
     ".p2align   2                              \n"
   "1:                                          \n"
     MEMACCESS(0)
@@ -1344,7 +1344,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
     "st1        {v2.8b}, [%3], #8              \n"  // store 8 V.
     "b.gt       1b                             \n"
   : "+r"(src_uyvy),     // %0
-    "+r"(stride_uyvy),  // %1
+    "+r"(src_uyvyb),    // %1
    "+r"(dst_u),        // %2
    "+r"(dst_v),        // %3
    "+r"(pix)           // %4
@@ -1357,9 +1357,9 @@
 #ifdef HAS_HALFROW_NEON
 void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
+  const uint8* src_uvb = src_uv + src_uv_stride;
   asm volatile (
     // change the stride to row 2 pointer
-    "add        %x1, %x0, %w1, sxtw            \n"
   "1:                                          \n"
     MEMACCESS(0)
     "ld1        {v0.16b}, [%0], #16            \n"  // load row 1 16 pixels.
@@ -1371,7 +1371,7 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
     "st1        {v0.16b}, [%2], #16            \n"
     "b.gt       1b                             \n"
   : "+r"(src_uv),         // %0
-    "+r"(src_uv_stride),  // %1
+    "+r"(src_uvb),        // %1
    "+r"(dst_uv),          // %2
    "+r"(pix)              // %3
  :
@@ -1682,11 +1682,11 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
 void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    "movi       v20.8h, #112 / 2               \n"  // UB / VR 0.875 coefficient
-    "movi       v21.8h, #74 / 2                \n"  // UG -0.5781 coefficient
-    "movi       v22.8h, #38 / 2                \n"  // UR -0.2969 coefficient
-    "movi       v23.8h, #18 / 2                \n"  // VB -0.1406 coefficient
-    "movi       v24.8h, #94 / 2                \n"  // VG -0.7344 coefficient
+    "movi       v20.8h, #56                    \n"  // UB / VR 0.875 / 2 coefficient
+    "movi       v21.8h, #37                    \n"  // UG -0.5781 / 2 coefficient
+    "movi       v22.8h, #19                    \n"  // UR -0.2969 / 2 coefficient
+    "movi       v23.8h, #9                     \n"  // VB -0.1406 / 2 coefficient
+    "movi       v24.8h, #47                    \n"  // VG -0.7344 / 2 coefficient
     "movi       v25.16b, #0x80                 \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -1732,11 +1732,11 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
 void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int pix) {
   asm volatile (
-    "movi       v20.8h, #112 / 2               \n"  // UB / VR 0.875 coefficient
-    "movi       v21.8h, #74 / 2                \n"  // UG -0.5781 coefficient
-    "movi       v22.8h, #38 / 2                \n"  // UR -0.2969 coefficient
-    "movi       v23.8h, #18 / 2                \n"  // VB -0.1406 coefficient
-    "movi       v24.8h, #94 / 2                \n"  // VG -0.7344 coefficient
+    "movi       v20.8h, #56                    \n"  // UB / VR 0.875 / 2 coefficient
+    "movi       v21.8h, #37                    \n"  // UG -0.5781 / 2 coefficient
+    "movi       v22.8h, #19                    \n"  // UR -0.2969 / 2 coefficient
+    "movi       v23.8h, #9                     \n"  // VB -0.1406 / 2 coefficient
+    "movi       v24.8h, #47                    \n"  // VG -0.7344 / 2 coefficient
     "movi       v25.16b, #0x80                 \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -1800,16 +1800,18 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
     "vqshrn.u16  d1, q9, #8                    \n"  /* 16 bit to 8 bit V */
 
 // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+// TODO(fbarchard): consider ptrdiff_t for all strides.
+
 #ifdef HAS_ARGBTOUVROW_NEON
 void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -1908,11 +1910,11 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_bgra
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -1959,11 +1961,11 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_abgr
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2010,11 +2012,11 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgba
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2061,11 +2063,11 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
                        uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_rgb24
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2112,11 +2114,11 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
                      uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_raw
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2164,11 +2166,11 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
                         uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2236,11 +2238,11 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
                           uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2308,11 +2310,11 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
                           uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "add        %1, %0, %1                     \n"  // src_stride + src_argb
-    "vmov.s16   q10, #112 / 2                  \n"  // UB / VR 0.875 coefficient
-    "vmov.s16   q11, #74 / 2                   \n"  // UG -0.5781 coefficient
-    "vmov.s16   q12, #38 / 2                   \n"  // UR -0.2969 coefficient
-    "vmov.s16   q13, #18 / 2                   \n"  // VB -0.1406 coefficient
-    "vmov.s16   q14, #94 / 2                   \n"  // VG -0.7344 coefficient
+    "vmov.s16   q10, #56                       \n"  // UB / VR 0.875 coefficient
+    "vmov.s16   q11, #37                       \n"  // UG -0.5781 coefficient
+    "vmov.s16   q12, #19                       \n"  // UR -0.2969 coefficient
+    "vmov.s16   q13, #9                        \n"  // VB -0.1406 coefficient
+    "vmov.s16   q14, #47                       \n"  // VG -0.7344 coefficient
     "vmov.u16   q15, #0x8080                   \n"  // 128.5
     ".p2align   2                              \n"
   "1:                                          \n"
@@ -2748,9 +2750,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
     // Blend 1 pixels.
   "1:                                          \n"
     MEMACCESS(0)
-    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
+    "ld4        {v0.b,v1.b,v2.b,v3.b}[0], [%0], #4 \n"  // load 1 pixel ARGB0.
     MEMACCESS(1)
-    "ld4        {v4.8b,v5.8b,v6.8b,v7.8b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
+    "ld4        {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n"  // load 1 pixel ARGB1.
     "subs       %3, %3, #1                     \n"  // 1 processed per loop.
     "umull      v16.8h, v4.8b, v3.8b           \n"  // db * a
     "umull      v17.8h, v5.8b, v3.8b           \n"  // dg * a
@@ -2766,7 +2768,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
     "uqadd      v2.8b, v2.8b, v6.8b            \n"  // + sr
     "movi       v3.8b, #255                    \n"  // a = 255
     MEMACCESS(2)
-    "st4        {v0.8b,v1.8b,v2.8b,v3.8b}[0], [%2], #4 \n"  // store 1 pixel.
+    "st4        {v0.b,v1.b,v2.b,v3.b}[0], [%2], #4 \n"  // store 1 pixel.
     "b.ge       1b                             \n"
   "99:                                         \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index f58fc5138..d79c35396 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -21,7 +21,8 @@ extern "C" {
 #endif
 
 // This module is for Visual C.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER) && \
+    (defined(_M_IX86) || defined(_M_X64))
 
 #define YG 74 /* (int8)(1.164 * 64 + 0.5) */
diff --git a/source/scale_neon64.cc b/source/scale_neon64.cc
index 90bc74817..e31a6c930 100644
--- a/source/scale_neon64.cc
+++ b/source/scale_neon64.cc
@@ -105,12 +105,12 @@ asm volatile (
     MEMACCESS(0)
     "ld1        {v0.16b}, [%0], #16            \n"  // load up 16x4
     MEMACCESS(3)
-    "ld1        {v1.16b}, [%3], #16            \n"
+    "ld1        {v1.16b}, [%2], #16            \n"
     MEMACCESS(4)
-    "ld1        {v2.16b}, [%4], #16            \n"
+    "ld1        {v2.16b}, [%3], #16            \n"
     MEMACCESS(5)
-    "ld1        {v3.16b}, [%5], #16            \n"
-    "subs       %2, %2, #4                     \n"
+    "ld1        {v3.16b}, [%4], #16            \n"
+    "subs       %5, %5, #4                     \n"
     "uaddlp     v0.8h, v0.16b                  \n"
     "uadalp     v0.8h, v1.16b                  \n"
     "uadalp     v0.8h, v2.16b                  \n"
@@ -122,10 +122,10 @@ asm volatile (
     "b.gt       1b                             \n"
   : "+r"(src_ptr),    // %0
    "+r"(dst_ptr),    // %1
-    "+r"(dst_width),  // %2
-    "+r"(src_ptr1),   // %3
-    "+r"(src_ptr2),   // %4
-    "+r"(src_ptr3)    // %5
+    "+r"(src_ptr1),   // %2
+    "+r"(src_ptr2),   // %3
+    "+r"(src_ptr3),   // %4
+    "+r"(dst_width)   // %5
  :
  : "v0", "v1", "v2", "v3", "memory", "cc"
  );
@@ -144,7 +144,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
     MEMACCESS(0)
     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // src line 0
     "subs      %2, %2, #24                     \n"
-    "mov       v2.8b, v3.8b                    \n"  // order v0, v1, v2
+    "orr       v2.16b, v3.16b, v3.16b          \n"  // order v0, v1, v2
     MEMACCESS(1)
     "st3       {v0.8b,v1.8b,v2.8b}, [%1], #24  \n"
     "b.gt      1b                              \n"
@@ -309,6 +309,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
                                       ptrdiff_t src_stride,
                                       uint8* dst_ptr, int dst_width) {
   const uint8* src_ptr1 = src_ptr + src_stride * 2;
+  ptrdiff_t tmp_src_stride = src_stride;
 
   asm volatile (
     MEMACCESS(5)
@@ -317,7 +318,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
     "ld1       {v30.16b}, [%6]                 \n"
     MEMACCESS(7)
     "ld1       {v31.8h}, [%7]                  \n"
-    "add       %3, %3, %0                      \n"
+    "add       %2, %2, %0                      \n"
   "1:                                          \n"
 
     // 00 40 01 41 02 42 03 43
@@ -327,10 +328,10 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
     MEMACCESS(0)
     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
     MEMACCESS(3)
-    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
+    "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
     MEMACCESS(4)
-    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%4], #32 \n"
-    "subs      %2, %2, #12                     \n"
+    "ld4       {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
+    "subs      %4, %4, #12                     \n"
 
     // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -420,9 +421,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
     "b.gt      1b                              \n"
   : "+r"(src_ptr),         // %0
    "+r"(dst_ptr),         // %1
-    "+r"(dst_width),       // %2
-    "+r"(src_stride),      // %3
-    "+r"(src_ptr1)         // %4
+    "+r"(tmp_src_stride),  // %2
+    "+r"(src_ptr1),        // %3
+    "+r"(dst_width)        // %4
  : "r"(&kMult38_Div6),  // %5
    "r"(&kShuf38_2),     // %6
    "r"(&kMult38_Div9)   // %7
@@ -438,12 +439,14 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
 void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
                                ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
+  // TODO(fbarchard): use src_stride directly for clang 3.5+.
+  ptrdiff_t tmp_src_stride = src_stride;
   asm volatile (
     MEMACCESS(4)
     "ld1       {v30.8h}, [%4]                  \n"
     MEMACCESS(5)
     "ld1       {v31.16b}, [%5]                 \n"
-    "add       %3, %3, %0                      \n"
+    "add       %2, %2, %0                      \n"
   "1:                                          \n"
 
     // 00 40 01 41 02 42 03 43
@@ -454,7 +457,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
     "ld4       {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
     MEMACCESS(3)
     "ld4       {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n"
-    "subs      %2, %2, #12                     \n"
+    "subs      %3, %3, #12                     \n"
 
     // Shuffle the input data around to get align the data
    //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -528,12 +531,12 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
     MEMACCESS(1)
     "st1       {v3.s}[2], [%1], #4             \n"
     "b.gt      1b                              \n"
-  : "+r"(src_ptr),       // %0
-    "+r"(dst_ptr),       // %1
-    "+r"(dst_width),     // %2
-    "+r"(src_stride)     // %3
-  : "r"(&kMult38_Div6),  // %4
-    "r"(&kShuf38_2)      // %5
+  : "+r"(src_ptr),         // %0
+    "+r"(dst_ptr),         // %1
+    "+r"(tmp_src_stride),  // %2
+    "+r"(dst_width)        // %3
+  : "r"(&kMult38_Div6),    // %4
+    "r"(&kShuf38_2)        // %5
  : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
    "v16", "v17", "v18", "v19", "v30", "v31", "memory", "cc"
  );
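
Notes on recurring patterns in this CL (these notes and sketches are not part
of the patch itself):

1. The "#112 / 2" -> "#56" edits bake the halving into the immediate instead
   of relying on the assembler to fold the expression; the rest of the CL is
   about getting clang's arm64 toolchain working, and a literal immediate is
   portable across assemblers. The constants are the fixed-point UV
   coefficients recorded in the inline comments (112 = 0.875 * 128, and so
   on). A scalar sketch of the full-scale math the NEON rows implement,
   consistent with those coefficients and the 0x8080 bias visible in the
   diff -- the helper names here are illustrative, not functions this CL
   touches:

    #include <stdint.h>
    #include <stdio.h>

    /* Fixed-point UV conversion: coefficients scaled by 128; 0x8080 adds
       the +128 chroma bias plus 0.5 of rounding before the >> 8. */
    static uint8_t RGBToU(int r, int g, int b) {
      return (uint8_t)((112 * b - 74 * g - 38 * r + 0x8080) >> 8);
    }
    static uint8_t RGBToV(int r, int g, int b) {
      return (uint8_t)((112 * r - 94 * g - 18 * b + 0x8080) >> 8);
    }

    int main(void) {
      /* Gray input: both channels collapse to the 128 bias. */
      printf("U=%u V=%u\n", RGBToU(128, 128, 128), RGBToV(128, 128, 128));
      return 0;
    }

   The row kernels pairwise-add two adjacent pixels before the multiply, so
   the immediates loaded into v20..v24 carry a pre-applied divide by two
   (56, 37, 19, 9, 47), which is what the updated "0.875 / 2" comments record.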
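
2. YUY2ToUVRow_NEON, UYVYToUVRow_NEON and HalfRow_NEON stop forming the
   second-row pointer inside the asm ("add %x1, %x0, %w1, sxtw") and hoist it
   into C (src_yuy2b, src_uyvyb, src_uvb), so the "+r" operand handed to the
   asm is a genuine pointer rather than an int stride that the asm widens and
   then reuses as an address. A minimal C sketch of the hoisted shape, with
   illustrative names (the real kernels do this averaging in NEON, not in a
   scalar loop):

    #include <stddef.h>
    #include <stdint.h>

    static void AverageRows(const uint8_t* src, ptrdiff_t stride,
                            uint8_t* dst, int width) {
      const uint8_t* src_row2 = src + stride;  /* hoisted, like src_yuy2b */
      for (int i = 0; i < width; ++i) {
        /* Rounded vertical average of the two rows. */
        dst[i] = (uint8_t)((src[i] + src_row2[i] + 1) >> 1);
      }
    }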
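
3. The scale_neon64.cc hunks renumber the inline-asm operands so dst_width
   (the counter the "subs" decrements) sits last, and bind the stride through
   a scratch copy (tmp_src_stride); the in-file TODO suggests the copy is a
   workaround that can go away once clang 3.5+ is the minimum. The underlying
   contract being respected: a "+r" operand is read-write, so the asm is free
   to clobber it, and any value needed afterwards must be copied first. An
   aarch64-only illustration of that contract (not code from this CL):

    #include <stddef.h>
    #include <stdint.h>

    #if defined(__aarch64__)
    static const uint8_t* SecondRow(const uint8_t* base, ptrdiff_t stride) {
      uintptr_t tmp = (uintptr_t)stride;  /* scratch, like tmp_src_stride */
      /* "+r": the asm both reads and overwrites tmp; stride is untouched. */
      asm volatile("add %0, %0, %1" : "+r"(tmp) : "r"(base));
      return (const uint8_t*)tmp;  /* == base + stride */
    }
    #endif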