diff --git a/README.chromium b/README.chromium index 22c107b98..c5136bd63 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1012 +Version: 1013 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index cdb6eeabd..222cf183e 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -449,7 +449,7 @@ typedef uint8 uvec8[16]; #if defined(__arm__) #undef MEMACCESS #if defined(__native_client__) -#define MEMACCESS(base) "bic %" #base ", #0xc0000000\n" +#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n" #else #define MEMACCESS(base) "\n" #endif diff --git a/include/libyuv/version.h b/include/libyuv/version.h index fd47537c1..b61c534d7 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1012 +#define LIBYUV_VERSION 1013 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_neon.cc b/source/row_neon.cc index 4f5158f91..a84e3e468 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -16,39 +16,50 @@ extern "C" { #endif // This module is for GCC Neon -#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \ - !defined(__native_client__) +#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) // Read 8 Y, 4 U and 4 V from 422 #define READYUV422 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.32 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.32 {d2[1]}, [%2]! \n" // Read 8 Y, 2 U and 2 V from 422 #define READYUV411 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.16 {d2[0]}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.16 {d2[1]}, [%2]! \n" \ "vmov.u8 d3, d2 \n" \ "vzip.u8 d2, d3 \n" // Read 8 Y, 8 U and 8 V from 444 #define READYUV444 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ + MEMACCESS(2) \ "vld1.8 {d3}, [%2]! \n" \ "vpaddl.u8 q1, q1 \n" \ "vrshrn.u16 d2, q1, #1 \n" // Read 8 Y, and set 4 U and 4 V to 128 #define READYUV400 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ "vmov.u8 d2, #128 \n" // Read 8 Y and 4 UV from NV12 #define READNV12 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d2, d3 \n" \ @@ -56,7 +67,9 @@ extern "C" { // Read 8 Y and 4 VU from NV21 #define READNV21 \ + MEMACCESS(0) \ "vld1.8 {d0}, [%0]! \n" \ + MEMACCESS(1) \ "vld1.8 {d2}, [%1]! \n" \ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\ "vuzp.u8 d3, d2 \n" \ @@ -64,6 +77,7 @@ extern "C" { // Read 8 YUY2 #define READYUY2 \ + MEMACCESS(0) \ "vld2.8 {d0, d2}, [%0]! \n" \ "vmov.u8 d3, d2 \n" \ "vuzp.u8 d2, d3 \n" \ @@ -71,6 +85,7 @@ extern "C" { // Read 8 UYVY #define READUYVY \ + MEMACCESS(0) \ "vld2.8 {d2, d3}, [%0]! \n" \ "vmov.u8 d0, d3 \n" \ "vmov.u8 d3, d2 \n" \ @@ -114,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -125,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -145,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -156,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -176,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -187,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -207,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y, uint8* dst_bgra, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -219,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -239,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y, uint8* dst_abgr, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -251,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(3) "vst4.8 {d20, d21, d22, d23}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -271,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y, uint8* dst_rgba, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -282,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vmov.u8 d19, #255 \n" + MEMACCESS(3) "vst4.8 {d19, d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -302,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y, uint8* dst_rgb24, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -312,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y, READYUV422 YUV422TORGB "subs %4, %4, #8 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -332,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y, uint8* dst_raw, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -343,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" "vswp.u8 d20, d22 \n" + MEMACCESS(3) "vst3.8 {d20, d21, d22}, [%3]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -375,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -386,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %4, %4, #8 \n" ARGBTORGB565 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -421,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, uint8* dst_argb1555, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -433,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB1555 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_y), // %0 @@ -462,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, uint8* dst_argb4444, int width) { asm volatile ( + MEMACCESS(5) "vld1.8 {d24}, [%5] \n" + MEMACCESS(6) "vld1.8 {d25}, [%6] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -475,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "subs %4, %4, #8 \n" "vmov.u8 d23, #255 \n" ARGBTOARGB4444 + MEMACCESS(3) "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_y), // %0 @@ -493,7 +541,10 @@ void YToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -504,6 +555,7 @@ void YToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -523,10 +575,12 @@ void I400ToARGBRow_NEON(const uint8* src_y, ".p2align 2 \n" "vmov.u8 d23, #255 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d20}, [%0]! \n" "vmov d21, d20 \n" "vmov d22, d20 \n" "subs %2, %2, #8 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -542,7 +596,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -553,6 +609,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -571,7 +628,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -582,6 +641,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(2) "vst4.8 {d20, d21, d22, d23}, [%2]! \n" "bgt 1b \n" : "+r"(src_y), // %0 @@ -600,7 +660,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -611,6 +673,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -629,7 +692,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, uint8* dst_rgb565, int width) { asm volatile ( + MEMACCESS(4) "vld1.8 {d24}, [%4] \n" + MEMACCESS(5) "vld1.8 {d25}, [%5] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -640,6 +705,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y, YUV422TORGB "subs %3, %3, #8 \n" ARGBTORGB565 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_y), // %0 @@ -657,7 +723,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -668,6 +736,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -684,7 +753,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {d24}, [%3] \n" + MEMACCESS(4) "vld1.8 {d25}, [%4] \n" "vmov.u8 d26, #128 \n" "vmov.u16 q14, #74 \n" @@ -695,6 +766,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy, YUV422TORGB "subs %2, %2, #8 \n" "vmov.u8 d23, #255 \n" + MEMACCESS(1) "vst4.8 {d20, d21, d22, d23}, [%1]! \n" "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -713,9 +785,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store U + MEMACCESS(2) "vst1.8 {q1}, [%2]! \n" // store V "bgt 1b \n" : "+r"(src_uv), // %0 @@ -733,9 +808,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load U + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load V "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(2) "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV "bgt 1b \n" : @@ -753,8 +831,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32 "subs %2, %2, #32 \n" // 32 processed per loop + MEMACCESS(1) "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32 "bgt 1b \n" : "+r"(src), // %0 @@ -771,6 +851,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) { "vdup.u32 q0, %2 \n" // duplicate 4 ints "1: \n" "subs %1, %1, #16 \n" // 16 bytes per loop + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" // store "bgt 1b \n" : "+r"(dst), // %0 @@ -799,10 +880,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #16 \n" // 16 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -823,10 +907,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16 "subs %3, #8 \n" // 8 pixels per loop. "vrev64.8 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // dst += 8 + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -847,10 +934,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0], r3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // dst += 16 + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" "bgt 1b \n" : "+r"(src), // %0 @@ -866,8 +956,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -883,9 +975,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { "vmov.u8 d4, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -913,9 +1007,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -959,9 +1055,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -988,9 +1086,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "vmov.u8 d3, #255 \n" // Alpha ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -1005,8 +1105,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1021,9 +1123,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vswp.u8 d1, d3 \n" // swap R, B + MEMACCESS(1) "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1038,8 +1142,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1054,8 +1160,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY. "subs %2, %2, #16 \n" // 16 processed per loop. + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1071,9 +1179,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 U. + MEMACCESS(2) "vst1.8 {d3}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1090,9 +1201,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %3, %3, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 U. + MEMACCESS(2) "vst1.8 {d2}, [%2]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1110,12 +1224,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "add %1, %0, %1 \n" // stride + src_yuy2 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2. "vrhadd.u8 d1, d1, d5 \n" // average rows of U "vrhadd.u8 d3, d3, d7 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d3}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_yuy2), // %0 @@ -1134,12 +1252,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "add %1, %0, %1 \n" // stride + src_uyvy ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY. "subs %4, %4, #16 \n" // 16 pixels = 8 UVs. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY. "vrhadd.u8 d0, d0, d4 \n" // average rows of U "vrhadd.u8 d2, d2, d6 \n" // average rows of V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 U. + MEMACCESS(3) "vst1.8 {d2}, [%3]! \n" // store 8 V. "bgt 1b \n" : "+r"(src_uyvy), // %0 @@ -1158,10 +1280,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride, // change the stride to row 2 pointer "add %1, %0 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels. "subs %3, %3, #16 \n" // 16 processed per loop + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels. "vrhadd.u8 q0, q1 \n" // average row 1 and 2 + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" "bgt 1b \n" : "+r"(src_uv), // %0 @@ -1179,11 +1304,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer, asm volatile ( "vmov.u32 d6[0], %3 \n" // selector "1: \n" + MEMACCESS(0) "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels "vtrn.u32 d4, d5 \n" // combine 8 pixels + MEMACCESS(1) "vst1.8 {d4}, [%1]! \n" // store 8. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1199,8 +1326,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, uint32 /*selector*/, int pix) { asm volatile ( "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels. "subs %2, %2, #8 \n" // 8 processed per loop + MEMACCESS(1) "vst1.8 {d1}, [%1]! \n" // store 8 G's. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1215,12 +1344,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer, void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, const uint8* shuffler, int pix) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // shuffler "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 4 pixels. "subs %2, %2, #4 \n" // 4 processed per loop "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels + MEMACCESS(1) "vst1.8 {q1}, [%1]! \n" // store 4. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1238,10 +1370,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d3}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1261,10 +1397,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 Us + MEMACCESS(2) "vld1.8 {d2}, [%2]! \n" // load 8 Vs "subs %4, %4, #16 \n" // 16 pixels + MEMACCESS(3) "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels. "bgt 1b \n" : "+r"(src_y), // %0 @@ -1281,9 +1421,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTORGB565 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1299,9 +1441,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB1555 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1318,9 +1462,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "vmov.u8 d4, #0x0f \n" // bits to clear with vbic. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. ARGBTOARGB4444 + MEMACCESS(1) "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1339,6 +1485,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1346,6 +1493,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1363,12 +1511,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1391,6 +1541,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -1406,7 +1557,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1430,7 +1583,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. @@ -1451,7 +1606,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1476,12 +1633,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(0) "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels. + MEMACCESS(0) "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels. "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts. @@ -1509,7 +1670,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. + MEMACCESS(2) "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1548,12 +1711,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1565,7 +1732,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1592,12 +1761,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1609,7 +1782,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -1635,12 +1810,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels. "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels. "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts. @@ -1652,7 +1831,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q3, q2, q1) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -1678,12 +1859,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1695,7 +1880,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -1721,12 +1908,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels. + MEMACCESS(0) "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels. "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels. + MEMACCESS(1) "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels. "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts. @@ -1738,7 +1929,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -1764,12 +1957,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels. "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1781,7 +1978,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q0, q1, q2) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -1807,12 +2006,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels. + MEMACCESS(0) "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels. "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts. + MEMACCESS(1) "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels. + MEMACCESS(1) "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels. "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. @@ -1824,7 +2027,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "subs %4, %4, #16 \n" // 32 processed per loop. RGBTOUV(q2, q1, q0) + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -1851,22 +2056,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels. RGB565TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1888,7 +2097,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -1915,22 +2126,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels. RGB555TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -1952,7 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -1979,22 +2196,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vmov.u16 q15, #0x8080 \n" // 128.5 ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts. "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts. "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts. "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts. + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels. ARGB4444TOARGB "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts. @@ -2016,7 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 pixels U. + MEMACCESS(3) "vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2038,6 +2261,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. RGB565TOARGB @@ -2046,6 +2270,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb565), // %0 @@ -2064,6 +2289,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB1555TOARGB @@ -2072,6 +2298,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb1555), // %0 @@ -2090,6 +2317,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmov.u8 d27, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels. "subs %2, %2, #8 \n" // 8 processed per loop. ARGB4444TOARGB @@ -2098,6 +2326,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { "vmlal.u8 q2, d2, d26 \n" // R "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d27 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_argb4444), // %0 @@ -2116,6 +2345,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // R @@ -2123,6 +2353,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_bgra), // %0 @@ -2141,6 +2372,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // R @@ -2148,6 +2380,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // B "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_abgr), // %0 @@ -2166,6 +2399,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d1, d4 \n" // B @@ -2173,6 +2407,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { "vmlal.u8 q8, d3, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgba), // %0 @@ -2191,6 +2426,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2198,6 +2434,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_rgb24), // %0 @@ -2216,6 +2453,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmov.u8 d7, #16 \n" // Add 16 constant ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q8, d0, d4 \n" // B @@ -2223,6 +2461,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { "vmlal.u8 q8, d2, d6 \n" // R "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y "vqadd.u8 d0, d7 \n" + MEMACCESS(1) "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y. "bgt 1b \n" : "+r"(src_raw), // %0 @@ -2253,7 +2492,9 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vdup.8 d4, %4 \n" // General purpose row blend. "1: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vmull.u8 q13, d0, d4 \n" @@ -2262,46 +2503,58 @@ void InterpolateRow_NEON(uint8* dst_ptr, "vmlal.u8 q14, d3, d5 \n" "vrshrn.u16 d0, q13, #8 \n" "vrshrn.u16 d1, q14, #8 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 1b \n" "b 99f \n" // Blend 25 / 75. "25: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 25b \n" "b 99f \n" // Blend 50 / 50. "50: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q1}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 50b \n" "b 99f \n" // Blend 75 / 25. "75: \n" + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" + MEMACCESS(2) "vld1.8 {q0}, [%2]! \n" "subs %3, %3, #16 \n" "vrhadd.u8 q0, q1 \n" "vrhadd.u8 q0, q1 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 75b \n" "b 99f \n" // Blend 100 / 0 - Copy row unchanged. "100: \n" + MEMACCESS(1) "vld1.8 {q0}, [%1]! \n" "subs %3, %3, #16 \n" + MEMACCESS(0) "vst1.8 {q0}, [%0]! \n" "bgt 100b \n" @@ -2324,7 +2577,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "blt 89f \n" // Blend 8 pixels. "8: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2338,6 +2593,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB. "bge 8b \n" @@ -2347,7 +2603,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // Blend 1 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0. + MEMACCESS(1) "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1. "subs %3, %3, #1 \n" // 1 processed per loop. "vmull.u8 q10, d4, d3 \n" // db * a @@ -2361,6 +2619,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vqadd.u8 q0, q0, q2 \n" // + sbg "vqadd.u8 d2, d2, d6 \n" // + sr "vmov.u8 d3, #255 \n" // a = 255 + MEMACCESS(2) "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel. "bge 1b \n" @@ -2380,6 +2639,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q10, d0, d3 \n" // b * a @@ -2388,6 +2648,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8 "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8 "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8 + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2411,6 +2672,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB. "subs %1, %1, #8 \n" // 8 processed per loop. "vmovl.u8 q0, d0 \n" // b (0 .. 255) @@ -2428,6 +2690,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, "vqmovn.u16 d0, q0 \n" "vqmovn.u16 d2, q1 \n" "vqmovn.u16 d4, q2 \n" + MEMACCESS(0) "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2452,6 +2715,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q10, d20 \n" // b (0 .. 255) @@ -2466,6 +2730,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, "vqmovn.u16 d22, q11 \n" "vqmovn.u16 d24, q12 \n" "vqmovn.u16 d26, q13 \n" + MEMACCESS(1) "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2486,6 +2751,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d24 \n" // B @@ -2494,6 +2760,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R + MEMACCESS(1) "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2521,6 +2788,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vmov.u8 d30, #50 \n" // BR coefficient ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels. "subs %1, %1, #8 \n" // 8 processed per loop. "vmull.u8 q2, d0, d20 \n" // B to Sepia B @@ -2535,6 +2803,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R + MEMACCESS(0) "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(dst_argb), // %0 @@ -2551,12 +2820,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( + MEMACCESS(3) "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors. "vmovl.s8 q0, d4 \n" // B,G coefficients s16. "vmovl.s8 q1, d5 \n" // R,A coefficients s16. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels. "subs %2, %2, #8 \n" // 8 processed per loop. "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit @@ -2595,6 +2866,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A + MEMACCESS(1) "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_argb), // %0 @@ -2615,7 +2887,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vmull.u8 q0, d0, d1 \n" // multiply B @@ -2626,6 +2900,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2646,11 +2921,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 q0, q0, q2 \n" // add B, G "vqadd.u8 q1, q1, q3 \n" // add R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2670,11 +2948,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. + MEMACCESS(1) "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. "vqsub.u8 q0, q0, q2 \n" // subtract B, G "vqsub.u8 q1, q1, q3 \n" // subtract R, A + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" @@ -2699,12 +2980,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d1}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d0, d0, d1 \n" // add "vmov.u8 d1, d0 \n" "vmov.u8 d2, d0 \n" + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2723,10 +3007,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 16 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {q0}, [%0]! \n" // load 16 sobelx. + MEMACCESS(1) "vld1.8 {q1}, [%1]! \n" // load 16 sobely. "subs %3, %3, #16 \n" // 16 processed per loop. "vqadd.u8 q0, q0, q1 \n" // add + MEMACCESS(2) "vst1.8 {q0}, [%2]! \n" // store 16 pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2750,10 +3037,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, // 8 pixel loop. ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d2}, [%0]! \n" // load 8 sobelx. + MEMACCESS(1) "vld1.8 {d0}, [%1]! \n" // load 8 sobely. "subs %3, %3, #8 \n" // 8 processed per loop. "vqadd.u8 d1, d0, d2 \n" // add + MEMACCESS(2) "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels. "bgt 1b \n" : "+r"(src_sobelx), // %0 @@ -2774,21 +3064,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%5 \n" // top + MEMACCESS(0) "vld1.8 {d1}, [%0],%6 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(1) "vld1.8 {d2}, [%1],%5 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%6 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(2) "vld1.8 {d2}, [%2],%5 \n" // bottom + MEMACCESS(2) "vld1.8 {d3}, [%2],%6 \n" "subs %4, %4, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(3) "vst1.8 {d0}, [%3]! \n" // store 8 sobelx "bgt 1b \n" : "+r"(src_y0), // %0 @@ -2811,21 +3108,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, asm volatile ( ".p2align 2 \n" "1: \n" + MEMACCESS(0) "vld1.8 {d0}, [%0],%4 \n" // left + MEMACCESS(1) "vld1.8 {d1}, [%1],%4 \n" "vsubl.u8 q0, d0, d1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%4 \n" // center * 2 + MEMACCESS(1) "vld1.8 {d3}, [%1],%4 \n" "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vadd.s16 q0, q0, q1 \n" + MEMACCESS(0) "vld1.8 {d2}, [%0],%5 \n" // right + MEMACCESS(1) "vld1.8 {d3}, [%1],%5 \n" "subs %3, %3, #8 \n" // 8 pixels "vsubl.u8 q1, d2, d3 \n" "vadd.s16 q0, q0, q1 \n" "vabs.s16 q0, q0 \n" "vqmovn.u16 d0, q0 \n" + MEMACCESS(2) "vst1.8 {d0}, [%2]! \n" // store 8 sobely "bgt 1b \n" : "+r"(src_y0), // %0