diff --git a/README.chromium b/README.chromium index 5811eeae8..0062b186b 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1794 +Version: 1796 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 1f6096e66..db63a25cb 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1794 +#define LIBYUV_VERSION 1796 #endif // INCLUDE_LIBYUV_VERSION_H_ \ No newline at end of file diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index e14615847..8285b6c36 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1298,7 +1298,7 @@ int ARGBToRGB24(const uint8_t* src_argb, #if defined(HAS_ARGBTORGB24ROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON; - if (IS_ALIGNED(width, 8)) { + if (IS_ALIGNED(width, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_NEON; } } diff --git a/source/row_any.cc b/source/row_any.cc index c9a402eda..5b113fb45 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -917,7 +917,7 @@ ANY11(ARGB1555ToARGBRow_Any_AVX2, ARGB1555ToARGBRow_AVX2, 0, 2, 4, 15) ANY11(ARGB4444ToARGBRow_Any_AVX2, ARGB4444ToARGBRow_AVX2, 0, 2, 4, 15) #endif #if defined(HAS_ARGBTORGB24ROW_NEON) -ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 7) +ANY11(ARGBToRGB24Row_Any_NEON, ARGBToRGB24Row_NEON, 0, 4, 3, 15) ANY11(ARGBToRAWRow_Any_NEON, ARGBToRAWRow_NEON, 0, 4, 3, 7) ANY11(ARGBToRGB565Row_Any_NEON, ARGBToRGB565Row_NEON, 0, 4, 2, 7) ANY11(ARGBToARGB1555Row_Any_NEON, ARGBToARGB1555Row_NEON, 0, 4, 2, 7) diff --git a/source/row_neon.cc b/source/row_neon.cc index 6ef6f1c46..03ad8302c 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1304,16 +1304,17 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "vld4.8 
{d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB. - "subs %2, %2, #8 \n" // 8 processed per loop. - "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of - // RGB24. + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" + "subs %2, %2, #16 \n" // 16 processed per loop. + "vst3.8 {d0, d2, d4}, [%1]! \n" // store 16 RGB24 pixels. + "vst3.8 {d1, d3, d5}, [%1]! \n" "bgt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : - : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List + : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List ); } @@ -2319,9 +2320,6 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"); } -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; - void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -2342,11 +2340,15 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, : "cc", "memory", "q0", "q1", "q2", "q3"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( - "vld1.8 q4, %3 \n" // shuffler + "vld1.8 {q4}, [%3] \n" // shuffler + "1: \n" "vld1.8 {q0}, [%0]! \n" "vld1.8 {q2}, [%0]! \n" @@ -2360,10 +2362,10 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "vst2.8 {q0, q1}, [%1]! \n" // store 4 pixels "vst2.8 {q2, q3}, [%1]! 
\n" // store 4 pixels "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } @@ -2397,7 +2399,8 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( - "vld1.8 d8, %3 \n" // shuffler + "vld1.8 {d8}, [%3] \n" // shuffler + "1: \n" "vld1.16 {q0}, [%0]! \n" "vld1.16 {q1}, [%0]! \n" @@ -2411,10 +2414,10 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "vst1.8 {q0}, [%1]! \n" // store 4 pixels "vst1.8 {q2}, [%1]! \n" // store 4 pixels "bgt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index e62e52b12..919f7f226 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -1373,17 +1373,16 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, int width) { asm volatile( "1: \n" - "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "subs %w2, %w2, #8 \n" // 8 processed per loop. + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64\n" // load 16 ARGB + "subs %w2, %w2, #16 \n" // 16 pixels per loop. 
"prfm pldl1keep, [%0, 448] \n" - "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of - // RGB24 + "st3 {v0.16b,v1.16b,v2.16b}, [%1], #48\n" // store 16 RGB24 "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 "+r"(width) // %2 : - : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List + : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } @@ -1684,8 +1683,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); } -static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, - 10, 9, 8, 11, 14, 13, 12, 15}; void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, @@ -1707,11 +1704,14 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v2", "v3"); } +static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, + 10, 9, 8, 11, 14, 13, 12, 15}; + void ARGBToAB64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ab64, int width) { asm volatile( - "ldr q4, %3 \n" // shuffler + "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "tbl v0.16b, {v0.16b}, v4.16b \n" @@ -1723,10 +1723,10 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_ab64), // %1 - "+r"(width) // %2 - : "m"(kShuffleARGBToABGR) // %3 + : "+r"(src_argb), // %0 + "+r"(dst_ab64), // %1 + "+r"(width) // %2 + : "r"(&kShuffleARGBToABGR) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } @@ -1737,7 +1737,7 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, uint8_t* dst_argb, int width) { asm volatile( - "ldr q4, %3 \n" // shuffler + "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels @@ -1747,10 +1747,10 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"stp q0, q2, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" - : "+r"(src_ar64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAR64ToARGB) // %3 + : "+r"(src_ar64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAR64ToARGB) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } @@ -1761,7 +1761,7 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, uint8_t* dst_argb, int width) { asm volatile( - "ldr q4, %3 \n" // shuffler + "ldr q4, [%3] \n" // shuffler "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels @@ -1771,10 +1771,10 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels "b.gt 1b \n" - : "+r"(src_ab64), // %0 - "+r"(dst_argb), // %1 - "+r"(width) // %2 - : "m"(kShuffleAB64ToARGB) // %3 + : "+r"(src_ab64), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(&kShuffleAB64ToARGB) // %3 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); }