libyuv 1053 -> 1054: port 16 more row functions to AArch64 NEON and enable them.

* include/libyuv/row.h: define the HAS_*_NEON macros for the ported functions.
* source/row_any.cc: give each affected *_Any_NEON wrapper its own #ifdef so it
  is only instantiated when the matching HAS_* macro is defined.
* source/row_neon64.cc: rewrite the inline assembly of those functions with A64
  mnemonics, v registers and post-indexed loads/stores.

Review fixes folded into this patch:

* The loop counters (pix / width) are 32-bit int operands. Their "subs" now
  uses the %w operand modifier; without a modifier the operand is emitted as a
  64-bit x register, so "subs" + "bgt" would also test the upper 32 bits, which
  are not defined for an int.
* The post-index immediate of the ld1 in ARGBToBayerRow_NEON is written "#32",
  matching every other post-indexed access in the file.

NOTE(review): YUY2ToUVRow_NEON, UYVYToUVRow_NEON and HalfRow_NEON still turn
their int stride operand into the second-row pointer in place
("add %x1, %x0, %w1, sxtw"), i.e. a 64-bit address lives in an operand declared
as int. Consider computing the second-row pointer in C and passing it as its own
pointer operand in a follow-up.

diff --git a/README.chromium b/README.chromium
index 8d6c9524e..fcab38358 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1053
+Version: 1054
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index bb4ab6947..f0ed6c942 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -279,25 +279,25 @@ extern "C" {
 // #define HAS_MIRRORROW_NEON
 // #define HAS_MIRRORUVROW_NEON
 // #define HAS_ARGBMIRRORROW_NEON
-// #define HAS_RGB24TOARGBROW_NEON
-// #define HAS_RAWTOARGBROW_NEON
+#define HAS_RGB24TOARGBROW_NEON
+#define HAS_RAWTOARGBROW_NEON
 // #define HAS_RGB565TOARGBROW_NEON
 // #define HAS_ARGB1555TOARGBROW_NEON
 // #define HAS_ARGB4444TOARGBROW_NEON
-// #define HAS_ARGBTORGB24ROW_NEON
-// #define HAS_ARGBTORAWROW_NEON
-// #define HAS_YUY2TOYROW_NEON
-// #define HAS_UYVYTOYROW_NEON
-// #define HAS_YUY2TOUV422ROW_NEON
-// #define HAS_UYVYTOUV422ROW_NEON
-// #define HAS_YUY2TOUVROW_NEON
-// #define HAS_UYVYTOUVROW_NEON
-// #define HAS_HALFROW_NEON
-// #define HAS_ARGBTOBAYERROW_NEON
-// #define HAS_ARGBTOBAYERGGROW_NEON
-// #define HAS_ARGBSHUFFLEROW_NEON
-// #define HAS_I422TOYUY2ROW_NEON
-// #define HAS_I422TOUYVYROW_NEON
+#define HAS_ARGBTORGB24ROW_NEON
+#define HAS_ARGBTORAWROW_NEON
+#define HAS_YUY2TOYROW_NEON
+#define HAS_UYVYTOYROW_NEON
+#define HAS_YUY2TOUV422ROW_NEON
+#define HAS_UYVYTOUV422ROW_NEON
+#define HAS_YUY2TOUVROW_NEON
+#define HAS_UYVYTOUVROW_NEON
+#define HAS_HALFROW_NEON
+#define HAS_ARGBTOBAYERROW_NEON
+#define HAS_ARGBTOBAYERGGROW_NEON
+#define HAS_ARGBSHUFFLEROW_NEON
+#define HAS_I422TOYUY2ROW_NEON
+#define HAS_I422TOUYVYROW_NEON
 // #define HAS_ARGBTORGB565ROW_NEON
 // #define HAS_ARGBTOARGB1555ROW_NEON
 // #define HAS_ARGBTOARGB4444ROW_NEON
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 6c0e316fe..86c685b5e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1053
+#define LIBYUV_VERSION 1054
 
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/row_any.cc b/source/row_any.cc
index 97ef84417..ce8b3dad1 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -79,9 +79,13 @@ YANY(I422ToARGB4444Row_Any_NEON, I422ToARGB4444Row_NEON, I422ToARGB4444Row_C,
 YANY(I422ToARGB1555Row_Any_NEON, I422ToARGB1555Row_NEON, I422ToARGB1555Row_C,
      1, 2, 7)
 YANY(I422ToRGB565Row_Any_NEON, I422ToRGB565Row_NEON, I422ToRGB565Row_C, 1, 2, 7)
-YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
-YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
 #endif // HAS_I422TOARGBROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
+YANY(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, I422ToYUY2Row_C, 1, 2, 15)
+#endif // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
+YANY(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, I422ToUYVYRow_C, 1, 2, 15)
+#endif // HAS_I422TOUYVYROW_NEON
 #undef YANY
 
 // Wrappers to handle odd width
@@ -250,12 +254,26 @@ YANY(RAWToYRow_Any_NEON, RAWToYRow_NEON, 3, 1, 8)
 YANY(RGB565ToYRow_Any_NEON, RGB565ToYRow_NEON, 2, 1, 8)
 YANY(ARGB1555ToYRow_Any_NEON, ARGB1555ToYRow_NEON, 2, 1, 8)
 YANY(ARGB4444ToYRow_Any_NEON, ARGB4444ToYRow_NEON, 2, 1, 8)
+#endif
+#ifdef HAS_YUY2TOYROW_NEON
 YANY(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_UYVYTOYROW_NEON
 YANY(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 2, 1, 16)
+#endif
+#ifdef HAS_RGB24TOARGBROW_NEON
 YANY(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RAWTOARGBROW_NEON
 YANY(RAWToARGBRow_Any_NEON, RAWToARGBRow_NEON, 3, 4, 8)
+#endif
+#ifdef HAS_RGB565TOARGBROW_NEON
 YANY(RGB565ToARGBRow_Any_NEON, RGB565ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB1555TOARGBROW_NEON
 YANY(ARGB1555ToARGBRow_Any_NEON, ARGB1555ToARGBRow_NEON, 2, 4, 8)
+#endif
+#ifdef HAS_ARGB4444TOARGBROW_NEON
 YANY(ARGB4444ToARGBRow_Any_NEON, ARGB4444ToARGBRow_NEON, 2, 4, 8)
 #endif
 #undef YANY
@@ -333,7 +351,11 @@ UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
 UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
 UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
 UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_YUY2TOUVROW_NEON
 UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+#endif
+#ifdef HAS_UYVYTOUVROW_NEON
 UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 #endif
 #undef UVANY
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index e11768cca..10aca4c7c 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -1007,20 +1007,20 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
 #ifdef HAS_RGB24TOARGBROW_NEON
 void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8 d4, #255 \n" // Alpha
+    "movi v4.8b, #255 \n" // Alpha
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+    "ld3 {v1.8b-v3.8b}, [%0], #24 \n" // load 8 pixels of RGB24.
-    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
     MEMACCESS(1)
-    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+    "st4 {v1.8b-v4.8b}, [%1], #32 \n" // store 8 pixels of ARGB.
     "bgt 1b \n"
   : "+r"(src_rgb24), // %0
     "+r"(dst_argb), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
   );
 }
 #endif // HAS_RGB24TOARGBROW_NEON
@@ -1028,21 +1028,22 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
 #ifdef HAS_RAWTOARGBROW_NEON
 void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
   asm volatile (
-    "vmov.u8 d4, #255 \n" // Alpha
+    "movi v5.8b, #255 \n" // Alpha
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+    "ld3 {v0.8b-v2.8b}, [%0], #24 \n" // read r g b
-    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
-    "vswp.u8 d1, d3 \n" // swap R, B
+    "mov v3.8b, v1.8b \n" // move g
+    "mov v4.8b, v0.8b \n" // move r
     MEMACCESS(1)
-    "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+    "st4 {v2.8b-v5.8b}, [%1], #32 \n" // store b g r a
     "bgt 1b \n"
   : "+r"(src_raw), // %0
     "+r"(dst_argb), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
   );
 }
 #endif // HAS_RAWTOARGBROW_NEON
@@ -1170,16 +1171,16 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+    "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load 8 pixels of ARGB.
-    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
     MEMACCESS(1)
-    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+    "st3 {v1.8b-v3.8b}, [%1], #24 \n" // store 8 pixels of RGB24.
     "bgt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_rgb24), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
   );
 }
 #endif // HAS_ARGBTORGB24ROW_NEON
@@ -1190,17 +1191,18 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+    "ld4 {v1.8b-v4.8b}, [%0], #32 \n" // load b g r a
-    "subs %2, %2, #8 \n" // 8 processed per loop.
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
-    "vswp.u8 d1, d3 \n" // swap R, B
+    "mov v4.8b, v2.8b \n" // mov g
+    "mov v5.8b, v1.8b \n" // mov b
     MEMACCESS(1)
-    "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+    "st3 {v3.8b-v5.8b}, [%1], #24 \n" // store r g b
     "bgt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_raw), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+  : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
   );
 }
 #endif // HAS_ARGBTORAWROW_NEON
@@ -1211,16 +1213,16 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+    "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2.
-    "subs %2, %2, #16 \n" // 16 processed per loop.
+    "subs %w2, %w2, #16 \n" // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+    "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y.
     "bgt 1b \n"
   : "+r"(src_yuy2), // %0
     "+r"(dst_y), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "q0", "q1" // Clobber List
+  : "cc", "memory", "v0", "v1" // Clobber List
   );
 }
 #endif // HAS_YUY2TOYROW_NEON
@@ -1231,16 +1233,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+    "ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY.
-    "subs %2, %2, #16 \n" // 16 processed per loop.
+    "subs %w2, %w2, #16 \n" // 16 processed per loop.
     MEMACCESS(1)
-    "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+    "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y.
     "bgt 1b \n"
   : "+r"(src_uyvy), // %0
     "+r"(dst_y), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "q0", "q1" // Clobber List
+  : "cc", "memory", "v0", "v1" // Clobber List
   );
 }
 #endif // HAS_UYVYTOYROW_NEON
@@ -1252,19 +1254,19 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+    "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
-    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+    "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8 {d1}, [%1]! \n" // store 8 U.
+    "st1 {v1.8b}, [%1], #8 \n" // store 8 U.
     MEMACCESS(2)
-    "vst1.8 {d3}, [%2]! \n" // store 8 V.
+    "st1 {v3.8b}, [%2], #8 \n" // store 8 V.
     "bgt 1b \n"
   : "+r"(src_yuy2), // %0
     "+r"(dst_u), // %1
     "+r"(dst_v), // %2
     "+r"(pix) // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
   );
 }
 #endif // HAS_YUY2TOUV422ROW_NEON
@@ -1276,19 +1278,19 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+    "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
-    "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+    "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vst1.8 {d0}, [%1]! \n" // store 8 U.
+    "st1 {v0.8b}, [%1], #8 \n" // store 8 U.
     MEMACCESS(2)
-    "vst1.8 {d2}, [%2]! \n" // store 8 V.
+    "st1 {v2.8b}, [%2], #8 \n" // store 8 V.
     "bgt 1b \n"
   : "+r"(src_uyvy), // %0
     "+r"(dst_u), // %1
     "+r"(dst_v), // %2
     "+r"(pix) // %3
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
   );
 }
 #endif // HAS_UYVYTOUV422ROW_NEON
@@ -1297,20 +1299,20 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
 void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add %1, %0, %1 \n" // stride + src_yuy2
+    "add %x1, %x0, %w1, sxtw \n" // stride + src_yuy2
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+    "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of YUY2.
-    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+    "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
-    "vrhadd.u8 d1, d1, d5 \n" // average rows of U
-    "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+    "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row YUY2.
+    "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U
+    "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V
     MEMACCESS(2)
-    "vst1.8 {d1}, [%2]! \n" // store 8 U.
+    "st1 {v1.8b}, [%2], #8 \n" // store 8 U.
     MEMACCESS(3)
-    "vst1.8 {d3}, [%3]! \n" // store 8 V.
+    "st1 {v3.8b}, [%3], #8 \n" // store 8 V.
     "bgt 1b \n"
   : "+r"(src_yuy2), // %0
     "+r"(stride_yuy2), // %1
@@ -1318,7 +1320,7 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
     "+r"(dst_v), // %3
     "+r"(pix) // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
   );
 }
 #endif // HAS_YUY2TOUVROW_NEON
@@ -1327,20 +1329,20 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
 void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
-    "add %1, %0, %1 \n" // stride + src_uyvy
+    "add %x1, %x0, %w1, sxtw \n" // stride + src_uyvy
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+    "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load 16 pixels of UYVY.
-    "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+    "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs.
     MEMACCESS(1)
-    "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
-    "vrhadd.u8 d0, d0, d4 \n" // average rows of U
-    "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+    "ld4 {v4.8b-v7.8b}, [%1], #32 \n" // load next row UYVY.
+    "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U
+    "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V
     MEMACCESS(2)
-    "vst1.8 {d0}, [%2]! \n" // store 8 U.
+    "st1 {v0.8b}, [%2], #8 \n" // store 8 U.
     MEMACCESS(3)
-    "vst1.8 {d2}, [%3]! \n" // store 8 V.
+    "st1 {v2.8b}, [%3], #8 \n" // store 8 V.
     "bgt 1b \n"
   : "+r"(src_uyvy), // %0
     "+r"(stride_uyvy), // %1
@@ -1348,7 +1350,7 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
     "+r"(dst_v), // %3
     "+r"(pix) // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List
   );
 }
 #endif // HAS_UYVYTOUVROW_NEON
@@ -1358,23 +1360,23 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
                   uint8* dst_uv, int pix) {
   asm volatile (
     // change the stride to row 2 pointer
-    "add %1, %0 \n"
+    "add %x1, %x0, %w1, sxtw \n"
   "1: \n"
     MEMACCESS(0)
-    "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+    "ld1 {v0.16b}, [%0], #16 \n" // load row 1 16 pixels.
-    "subs %3, %3, #16 \n" // 16 processed per loop
+    "subs %w3, %w3, #16 \n" // 16 processed per loop
     MEMACCESS(1)
-    "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
-    "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+    "ld1 {v1.16b}, [%1], #16 \n" // load row 2 16 pixels.
+    "urhadd v0.16b, v0.16b, v1.16b \n" // average row 1 and 2
     MEMACCESS(2)
-    "vst1.8 {q0}, [%2]! \n"
+    "st1 {v0.16b}, [%2], #16 \n" // store 16 averaged pixels.
     "bgt 1b \n"
   : "+r"(src_uv), // %0
     "+r"(src_uv_stride), // %1
     "+r"(dst_uv), // %2
     "+r"(pix) // %3
   :
-  : "cc", "memory", "q0", "q1" // Clobber List
+  : "cc", "memory", "v0", "v1" // Clobber List
   );
 }
 #endif // HAS_HALFROW_NEON
@@ -1384,22 +1386,22 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
 void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
   asm volatile (
-    "vmov.u32 d6[0], %3 \n" // selector
+    "mov v2.s[0], %w3 \n" // selector
   "1: \n"
     MEMACCESS(0)
-    "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+    "ld1 {v0.16b, v1.16b}, [%0], #32 \n" // load row 8 pixels.
-    "subs %2, %2, #8 \n" // 8 processed per loop
+    "subs %w2, %w2, #8 \n" // 8 processed per loop
-    "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
-    "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
-    "vtrn.u32 d4, d5 \n" // combine 8 pixels
+    "tbl v4.8b, {v0.16b}, v2.8b \n" // look up 4 pixels
+    "tbl v5.8b, {v1.16b}, v2.8b \n" // look up 4 pixels
+    "trn1 v4.4s, v4.4s, v5.4s \n" // combine 8 pixels
     MEMACCESS(1)
-    "vst1.8 {d4}, [%1]! \n" // store 8.
+    "st1 {v4.8b}, [%1], #8 \n" // store 8.
     "bgt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_bayer), // %1
     "+r"(pix) // %2
   : "r"(selector) // %3
-  : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v4", "v5" // Clobber List
   );
 }
 #endif // HAS_ARGBTOBAYERROW_NEON
@@ -1411,16 +1413,16 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
   asm volatile (
   "1: \n"
     MEMACCESS(0)
-    "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+    "ld4 {v0.8b-v3.8b}, [%0], #32 \n" // load row 8 pixels.
-    "subs %2, %2, #8 \n" // 8 processed per loop
+    "subs %w2, %w2, #8 \n" // 8 processed per loop
     MEMACCESS(1)
-    "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+    "st1 {v1.8b}, [%1], #8 \n" // store 8 G's.
     "bgt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_bayer), // %1
     "+r"(pix) // %2
   :
-  : "cc", "memory", "q0", "q1" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
   );
 }
 #endif // HAS_ARGBTOBAYERGGROW_NEON
@@ -1431,21 +1433,20 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
   asm volatile (
     MEMACCESS(3)
-    "vld1.8 {q2}, [%3] \n" // shuffler
+    "ld1 {v2.16b}, [%3] \n" // shuffler
   "1: \n"
     MEMACCESS(0)
-    "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+    "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels.
-    "subs %2, %2, #4 \n" // 4 processed per loop
+    "subs %w2, %w2, #4 \n" // 4 processed per loop
-    "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
-    "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+    "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels
     MEMACCESS(1)
-    "vst1.8 {q1}, [%1]! \n" // store 4.
+    "st1 {v1.16b}, [%1], #16 \n" // store 4.
     "bgt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
     "+r"(pix) // %2
   : "r"(shuffler) // %3
-  : "cc", "memory", "q0", "q1", "q2" // Clobber List
+  : "cc", "memory", "v0", "v1", "v2" // Clobber List
   );
 }
 #endif // HAS_ARGBSHUFFLEROW_NEON
@@ -1459,14 +1460,15 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+    "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys
+    "mov v2.8b, v1.8b \n" // keep 2nd ld2 register; v1 is reloaded with U
     MEMACCESS(1)
-    "vld1.8 {d1}, [%1]! \n" // load 8 Us
+    "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us
     MEMACCESS(2)
-    "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+    "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs
-    "subs %4, %4, #16 \n" // 16 pixels
+    "subs %w4, %w4, #16 \n" // 16 pixels
     MEMACCESS(3)
-    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+    "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 YUY2/16 pixels.
     "bgt 1b \n"
   : "+r"(src_y), // %0
     "+r"(src_u), // %1
@@ -1474,7 +1476,7 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
     "+r"(dst_yuy2), // %3
     "+r"(width) // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif // HAS_I422TOYUY2ROW_NEON
@@ -1488,14 +1490,15 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
     ".p2align 2 \n"
   "1: \n"
     MEMACCESS(0)
-    "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+    "ld2 {v1.8b, v2.8b}, [%0], #16 \n" // load 16 Ys
+    "mov v3.8b, v2.8b \n" // keep 2nd ld2 register; v2 is reloaded with V
     MEMACCESS(1)
-    "vld1.8 {d0}, [%1]! \n" // load 8 Us
+    "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us
     MEMACCESS(2)
-    "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+    "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs
-    "subs %4, %4, #16 \n" // 16 pixels
+    "subs %w4, %w4, #16 \n" // 16 pixels
     MEMACCESS(3)
-    "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+    "st4 {v0.8b-v3.8b}, [%3], #32 \n" // Store 8 UYVY/16 pixels.
     "bgt 1b \n"
   : "+r"(src_y), // %0
     "+r"(src_u), // %1
@@ -1503,7 +1506,7 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
     "+r"(dst_uyvy), // %3
     "+r"(width) // %4
   :
-  : "cc", "memory", "d0", "d1", "d2", "d3"
+  : "cc", "memory", "v0", "v1", "v2", "v3"
   );
 }
 #endif // HAS_I422TOUYVYROW_NEON