diff --git a/README.chromium b/README.chromium index 5c4393f8b..8a708a5cd 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 720 +Version: 721 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 43bb9815a..15ece343a 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 720 +#define LIBYUV_VERSION 721 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/compare_neon.cc b/source/compare_neon.cc index 9ae113d04..a4e777506 100644 --- a/source/compare_neon.cc +++ b/source/compare_neon.cc @@ -27,8 +27,8 @@ uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) { ".p2align 2 \n" "1: \n" - "vld1.u8 {q0}, [%0]! \n" - "vld1.u8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%0]! \n" + "vld1.8 {q1}, [%1]! \n" "subs %2, %2, #16 \n" "vsubl.u8 q2, d0, d2 \n" "vsubl.u8 q3, d1, d3 \n" diff --git a/source/row_neon.cc b/source/row_neon.cc index 53da16afa..0bb55e717 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -20,57 +20,57 @@ extern "C" { // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ - "vld1.u8 {d0}, [%0]! \n" \ - "vld1.u32 {d2[0]}, [%1]! \n" \ - "vld1.u32 {d2[1]}, [%2]! \n" + "vld1.8 {d0}, [%0]! \n" \ + "vld1.32 {d2[0]}, [%1]! \n" \ + "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 2 U and 2 V from 422 #define READYUV411 \ - "vld1.u8 {d0}, [%0]! \n" \ - "vld1.u16 {d2[0]}, [%1]! \n" \ - "vld1.u16 {d2[1]}, [%2]! \n" \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.16 {d2[0]}, [%1]! \n" \ + "vld1.16 {d2[1]}, [%2]! \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d2, d3 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ - "vld1.u8 {d0}, [%0]! \n" \ - "vld1.u8 {d2}, [%1]! \n" \ - "vld1.u8 {d3}, [%2]! \n" \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ + "vld1.8 {d3}, [%2]! \n" \ "vpaddl.u8 q1, q1 \n" \ "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ - "vld1.u8 {d0}, [%0]! \n" \ + "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ - "vld1.u8 {d0}, [%0]! \n" \ - "vld1.u8 {d2}, [%1]! \n" \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ "vtrn.u32 d2, d3 \n" // Read 8 Y and 4 VU from NV21 #define READNV21 \ - "vld1.u8 {d0}, [%0]! \n" \ - "vld1.u8 {d2}, [%1]! \n" \ + "vld1.8 {d0}, [%0]! \n" \ + "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ "vtrn.u32 d2, d3 \n" // Read 8 YUY2 #define READYUY2 \ - "vld2.u8 {d0, d2}, [%0]! \n" \ + "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ "vtrn.u32 d2, d3 \n" // Read 8 UYVY #define READUYVY \ - "vld2.u8 {d2, d3}, [%0]! \n" \ + "vld2.8 {d2, d3}, [%0]! \n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -113,8 +113,8 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -144,8 +144,8 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -175,8 +175,8 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -206,8 +206,8 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -238,8 +238,8 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -270,8 +270,8 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -301,8 +301,8 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -331,8 +331,8 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -374,8 +374,8 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -420,8 +420,8 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -461,8 +461,8 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( - "vld1.u8 {d24}, [%5] \n" - "vld1.u8 {d25}, [%6] \n" + "vld1.8 {d24}, [%5] \n" + "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -492,8 +492,8 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%3] \n" - "vld1.u8 {d25}, [%4] \n" + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -522,7 +522,7 @@ void I400ToARGBRow_NEON(const uint8* src_y, ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" - "vld1.u8 {d20}, [%0]! \n" + "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" @@ -541,8 +541,8 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%4] \n" - "vld1.u8 {d25}, [%5] \n" + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -570,8 +570,8 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%4] \n" - "vld1.u8 {d25}, [%5] \n" + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -599,8 +599,8 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - "vld1.u8 {d24}, [%4] \n" - "vld1.u8 {d25}, [%5] \n" + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -628,8 +628,8 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( - "vld1.u8 {d24}, [%4] \n" - "vld1.u8 {d25}, [%5] \n" + "vld1.8 {d24}, [%4] \n" + "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -656,8 +656,8 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%3] \n" - "vld1.u8 {d25}, [%4] \n" + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -683,8 +683,8 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( - "vld1.u8 {d24}, [%3] \n" - "vld1.u8 {d25}, [%4] \n" + "vld1.8 {d24}, [%3] \n" + "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" "vmov.u16 q15, #16 \n" @@ -712,10 +712,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" - "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pairs of UV + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop - "vst1.u8 {q0}, [%1]! \n" // store U - "vst1.u8 {q1}, [%2]! \n" // store V + "vst1.8 {q0}, [%1]! \n" // store U + "vst1.8 {q1}, [%2]! \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -732,8 +732,8 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, asm volatile ( ".p2align 2 \n" "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load U - "vld1.u8 {q1}, [%1]! \n" // load V + "vld1.8 {q0}, [%0]! \n" // load U + "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" @@ -747,14 +747,14 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, ); } -// Copy multiple of 32. vld4.u8 allow unaligned and is fastest on a15. +// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( ".p2align 2 \n" "1: \n" - "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" // load 32 + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop - "vst1.u8 {d0, d1, d2, d3}, [%1]! \n" // store 32 + "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -770,7 +770,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop - "vst1.u8 {q0}, [%0]! \n" // store + "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 @@ -1037,9 +1037,9 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" - "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.u8 {q0}, [%1]! \n" // store 16 pixels of Y. + "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 @@ -1053,9 +1053,9 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" - "vld2.u8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. + "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. - "vst1.u8 {q1}, [%1]! \n" // store 16 pixels of Y. + "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 @@ -1072,8 +1072,8 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.u8 {d1}, [%1]! \n" // store 8 U. - "vst1.u8 {d3}, [%2]! \n" // store 8 V. + "vst1.8 {d1}, [%1]! \n" // store 8 U. + "vst1.8 {d3}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 @@ -1091,8 +1091,8 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. - "vst1.u8 {d0}, [%1]! \n" // store 8 U. - "vst1.u8 {d2}, [%2]! \n" // store 8 V. + "vst1.8 {d0}, [%1]! \n" // store 8 U. + "vst1.8 {d2}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 @@ -1114,8 +1114,8 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V - "vst1.u8 {d1}, [%2]! \n" // store 8 U. - "vst1.u8 {d3}, [%3]! \n" // store 8 V. + "vst1.8 {d1}, [%2]! \n" // store 8 U. + "vst1.8 {d3}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 "+r"(stride_yuy2), // %1 @@ -1138,8 +1138,8 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V - "vst1.u8 {d0}, [%2]! \n" // store 8 U. - "vst1.u8 {d2}, [%3]! \n" // store 8 V. + "vst1.8 {d0}, [%2]! \n" // store 8 U. + "vst1.8 {d2}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 "+r"(stride_uyvy), // %1 @@ -1157,11 +1157,11 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load row 1 16 pixels. + "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop - "vld1.u8 {q1}, [%1]! \n" // load row 2 16 pixels. + "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vrhadd.u8 q0, q1 \n" // average row 1 and 2 - "vst1.u8 {q0}, [%2]! \n" + "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(src_uv_stride), // %1 @@ -1178,12 +1178,12 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "vmov.u32 d6[0], %3 \n" // selector "1: \n" - "vld1.u8 {q0, q1}, [%0]! \n" // load row 8 pixels. + "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels - "vst1.u8 {d4}, [%1]! \n" // store 8. + "vst1.8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_bayer), // %1 @@ -1197,13 +1197,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( - "vld1.u8 {q2}, [%3] \n" // shuffler + "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load 4 pixels. + "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels - "vst1.u8 {q1}, [%1]! \n" // store 4. + "vst1.8 {q1}, [%1]! \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 @@ -1224,7 +1224,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y, "vld1.8 {d1}, [%1]! \n" // load 8 Us "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels - "vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -1247,7 +1247,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y, "vld1.8 {d0}, [%1]! \n" // load 8 Us "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels - "vst4.u8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. + "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 @@ -2181,8 +2181,8 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" @@ -2190,47 +2190,47 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" - "vld1.u8 {q1}, [%1]! \n" - "vld1.u8 {q0}, [%2]! \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.u8 {q0}, [%1]! \n" + "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" "99: \n" @@ -2478,7 +2478,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBColorMatrixRow_NEON(uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( - "vld1.u8 {q2}, [%2] \n" // load 3 ARGB vectors. + "vld1.8 {q2}, [%2] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R coefficients s16. @@ -2670,22 +2670,22 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" - "vld1.u8 {d0}, [%0],%5 \n" // top - "vld1.u8 {d1}, [%0],%6 \n" + "vld1.8 {d0}, [%0],%5 \n" // top + "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" - "vld1.u8 {d2}, [%1],%5 \n" // center * 2 - "vld1.u8 {d3}, [%1],%6 \n" + "vld1.8 {d2}, [%1],%5 \n" // center * 2 + "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" - "vld1.u8 {d2}, [%2],%5 \n" // bottom - "vld1.u8 {d3}, [%2],%6 \n" + "vld1.8 {d2}, [%2],%5 \n" // bottom + "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" - "vst1.u8 {d0}, [%3]! \n" // store 8 sobelx + "vst1.8 {d0}, [%3]! \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 @@ -2707,22 +2707,22 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" - "vld1.u8 {d0}, [%0],%4 \n" // left - "vld1.u8 {d1}, [%1],%4 \n" + "vld1.8 {d0}, [%0],%4 \n" // left + "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" - "vld1.u8 {d2}, [%0],%4 \n" // center * 2 - "vld1.u8 {d3}, [%1],%4 \n" + "vld1.8 {d2}, [%0],%4 \n" // center * 2 + "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" - "vld1.u8 {d2}, [%0],%5 \n" // right - "vld1.u8 {d3}, [%1],%5 \n" + "vld1.8 {d2}, [%0],%5 \n" // right + "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" - "vst1.u8 {d0}, [%2]! \n" // store 8 sobely + "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0 "+r"(src_y1), // %1 diff --git a/source/scale_argb_neon.cc b/source/scale_argb_neon.cc index 1b297b53d..51b008724 100644 --- a/source/scale_argb_neon.cc +++ b/source/scale_argb_neon.cc @@ -24,11 +24,11 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, asm volatile ( "1: \n" // load even pixels into q0, odd into q1 - "vld2.u32 {q0, q1}, [%0]! \n" - "vld2.u32 {q2, q3}, [%0]! \n" + "vld2.32 {q0, q1}, [%0]! \n" + "vld2.32 {q2, q3}, [%0]! \n" "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.u8 {q1}, [%1]! \n" // store odd pixels - "vst1.u8 {q3}, [%1]! \n" + "vst1.8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q3}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 @@ -61,7 +61,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vrshrn.u16 d1, q1, #2 \n" "vrshrn.u16 d2, q2, #2 \n" "vrshrn.u16 d3, q3, #2 \n" - "vst4.u8 {d0, d1, d2, d3}, [%2]! \n" + "vst4.8 {d0, d1, d2, d3}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 diff --git a/source/scale_neon.cc b/source/scale_neon.cc index 33240396b..a370349a7 100644 --- a/source/scale_neon.cc +++ b/source/scale_neon.cc @@ -27,9 +27,9 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, asm volatile ( "1: \n" // load even pixels into q0, odd into q1 - "vld2.u8 {q0, q1}, [%0]! \n" + "vld2.8 {q0, q1}, [%0]! \n" "subs %2, %2, #16 \n" // 16 processed per loop - "vst1.u8 {q1}, [%1]! \n" // store odd pixels + "vst1.8 {q1}, [%1]! \n" // store odd pixels "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst), // %1 @@ -45,8 +45,8 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" - "vld1.u8 {q0, q1}, [%0]! \n" // load row 1 and post inc - "vld1.u8 {q2, q3}, [%1]! \n" // load row 2 and post inc + "vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc + "vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc "subs %3, %3, #16 \n" // 16 processed per loop "vpaddl.u8 q0, q0 \n" // row 1 add adjacent "vpaddl.u8 q1, q1 \n" @@ -54,7 +54,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpadal.u8 q1, q3 \n" "vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack "vrshrn.u16 d1, q1, #2 \n" - "vst1.u8 {q0}, [%2]! \n" + "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(src_stride), // %1 @@ -69,9 +69,9 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #8 \n" // 8 processed per loop - "vst1.u8 {d2}, [%1]! \n" + "vst1.8 {d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -88,10 +88,10 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "add r5, r4, %3 \n" "add %3, r5, %3 \n" "1: \n" - "vld1.u8 {q0}, [%0]! \n" // load up 16x4 - "vld1.u8 {q1}, [r4]! \n" - "vld1.u8 {q2}, [r5]! \n" - "vld1.u8 {q3}, [%3]! \n" + "vld1.8 {q0}, [%0]! \n" // load up 16x4 + "vld1.8 {q1}, [r4]! \n" + "vld1.8 {q2}, [r5]! \n" + "vld1.8 {q3}, [%3]! \n" "subs %2, %2, #4 \n" "vpaddl.u8 q0, q0 \n" "vpadal.u8 q0, q1 \n" @@ -100,7 +100,7 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, "vpaddl.u16 q0, q0 \n" "vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding "vmovn.u16 d0, q0 \n" - "vst1.u32 {d0[0]}, [%1]! \n" + "vst1.32 {d0[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -118,10 +118,10 @@ void ScaleRowDown34_NEON(const uint8* src_ptr, uint8* dst_ptr, int dst_width) { asm volatile ( "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 "subs %2, %2, #24 \n" "vmov d2, d3 \n" // order d0, d1, d2 - "vst3.u8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -138,8 +138,8 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // filter src line 0 with src line 1 @@ -175,7 +175,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr, "vmlal.u8 q8, d3, d24 \n" "vqrshrn.u16 d2, q8, #2 \n" - "vst3.u8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 @@ -194,8 +194,8 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "vmov.u8 d24, #3 \n" "add %3, %0 \n" "1: \n" - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0 + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1 "subs %2, %2, #24 \n" // average src line 0 with src line 1 "vrhadd.u8 q0, q0, q2 \n" @@ -214,7 +214,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr, "vmlal.u8 q3, d3, d24 \n" "vqrshrn.u16 d2, q3, #2 \n" - "vst3.u8 {d0, d1, d2}, [%1]! \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -242,14 +242,14 @@ void ScaleRowDown38_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.u8 {q3}, [%3] \n" + "vld1.8 {q3}, [%3] \n" "1: \n" - "vld1.u8 {d0, d1, d2, d3}, [%0]! \n" + "vld1.8 {d0, d1, d2, d3}, [%0]! \n" "subs %2, %2, #12 \n" "vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n" "vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n" - "vst1.u8 {d4}, [%1]! \n" - "vst1.u32 {d5[0]}, [%1]! \n" + "vst1.8 {d4}, [%1]! \n" + "vst1.32 {d5[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -264,9 +264,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" - "vld1.u8 {q15}, [%6] \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" + "vld1.8 {q15}, [%6] \n" "add r4, %0, %3, lsl #1 \n" "add %3, %0 \n" "1: \n" @@ -275,9 +275,9 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" - "vld4.u8 {d16, d17, d18, d19}, [r4]! \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d16, d17, d18, d19}, [r4]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -354,8 +354,8 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -374,8 +374,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride, uint8* dst_ptr, int dst_width) { asm volatile ( - "vld1.u16 {q13}, [%4] \n" - "vld1.u8 {q14}, [%5] \n" + "vld1.16 {q13}, [%4] \n" + "vld1.8 {q14}, [%5] \n" "add %3, %0 \n" "1: \n" @@ -383,8 +383,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, // d1 = 10 50 11 51 12 52 13 53 // d2 = 20 60 21 61 22 62 23 63 // d3 = 30 70 31 71 32 72 33 73 - "vld4.u8 {d0, d1, d2, d3}, [%0]! \n" - "vld4.u8 {d4, d5, d6, d7}, [%3]! \n" + "vld4.8 {d0, d1, d2, d3}, [%0]! \n" + "vld4.8 {d4, d5, d6, d7}, [%3]! \n" "subs %2, %2, #12 \n" // Shuffle the input data around to get align the data @@ -450,8 +450,8 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr, "vtbl.u8 d3, {d0, d1, d2}, d28 \n" "vtbl.u8 d4, {d0, d1, d2}, d29 \n" - "vst1.u8 {d3}, [%1]! \n" - "vst1.u32 {d4[0]}, [%1]! \n" + "vst1.8 {d3}, [%1]! \n" + "vst1.32 {d4[0]}, [%1]! \n" "bgt 1b \n" : "+r"(src_ptr), // %0 "+r"(dst_ptr), // %1 @@ -483,8 +483,8 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" "vmull.u8 q14, d1, d4 \n" @@ -492,51 +492,51 @@ void ScaleFilterRows_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" - "vld1.u8 {q0}, [%1]! \n" - "vld1.u8 {q1}, [%2]! \n" + "vld1.8 {q0}, [%1]! \n" + "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" - "vld1.u8 {q1}, [%1]! \n" - "vld1.u8 {q0}, [%2]! \n" + "vld1.8 {q1}, [%1]! \n" + "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" - "vld1.u8 {q0}, [%1]! \n" + "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" - "vst1.u8 {q0}, [%0]! \n" + "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" "99: \n" - "vst1.u8 {d1[7]}, [%0] \n" + "vst1.8 {d1[7]}, [%0] \n" : "+r"(dst_ptr), // %0 "+r"(src_ptr), // %1 "+r"(src_stride), // %2