diff --git a/README.chromium b/README.chromium index ee2209cee..d71256b18 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1803 +Version: 1805 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 018931bb0..e88e9be24 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -303,6 +303,7 @@ extern "C" { #define HAS_MERGEXRGBROW_SSE2 #define HAS_MERGERGBROW_SSSE3 #define HAS_MIRRORUVROW_SSSE3 +#define HAS_NV21TOYUV24ROW_SSSE3 #define HAS_P210TOAR30ROW_SSSE3 #define HAS_P210TOARGBROW_SSSE3 #define HAS_P410TOAR30ROW_SSSE3 @@ -319,6 +320,7 @@ extern "C" { #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 + #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 // https://code.google.com/p/libyuv/issues/detail?id=517 @@ -355,6 +357,7 @@ extern "C" { #define HAS_MERGEXR64ROW_AVX2 #define HAS_MERGEXRGB16TO8ROW_AVX2 #define HAS_MERGEXRGBROW_AVX2 +#define HAS_NV21TOYUV24ROW_AVX2 #define HAS_I210TOAR30ROW_AVX2 #define HAS_I210TOARGBROW_AVX2 #define HAS_I212TOAR30ROW_AVX2 @@ -379,8 +382,6 @@ extern "C" { #define HAS_SPLITXRGBROW_AVX2 #define HAS_SPLITUVROW_16_AVX2 #define HAS_SWAPUVROW_AVX2 -// TODO(fbarchard): Fix AVX2 version of YUV24 -// #define HAS_NV21TOYUV24ROW_AVX2 #if defined(__x86_64__) || !defined(__pic__) // TODO(fbarchard): fix build error on android_full_debug=1 @@ -3227,6 +3228,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y, uint8_t* dst_rgb24, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, @@ -3568,6 +3573,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf, uint8_t* dst_ptr, const struct YuvConstants* yuvconstants, int width); +void NV21ToYUV24Row_Any_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width); void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 01a73b5a2..ac2634307 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1803 +#define LIBYUV_VERSION 1805 #endif // INCLUDE_LIBYUV_VERSION_H_ \ No newline at end of file diff --git a/source/convert_argb.cc b/source/convert_argb.cc index d8f7b2773..77d4f3bec 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -3925,6 +3925,14 @@ int NV21ToYUV24(const uint8_t* src_y, } } #endif +#if defined(HAS_NV21TOYUV24ROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + NV21ToYUV24Row = NV21ToYUV24Row_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + NV21ToYUV24Row = NV21ToYUV24Row_SSSE3; + } + } +#endif #if defined(HAS_NV21TOYUV24ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2; diff --git a/source/row_any.cc b/source/row_any.cc index e238c8dad..ed105a023 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -545,21 +545,21 @@ ANY31PT(MergeXRGB16To8Row_Any_NEON, #undef ANY31PT // Any 2 planes to 1. 
-#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ - void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ - int width) { \ - SIMD_ALIGNED(uint8_t temp[64 * 3]); \ - memset(temp, 0, 64 * 2); /* for msan */ \ - int r = width & MASK; \ - int n = width & ~MASK; \ - if (n > 0) { \ - ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ - } \ - memcpy(temp, y_buf + n * SBPP, r * SBPP); \ - memcpy(temp + 64, uv_buf + (n >> UVSHIFT) * SBPP2, \ - SS(r, UVSHIFT) * SBPP2); \ - ANY_SIMD(temp, temp + 64, temp + 128, MASK + 1); \ - memcpy(dst_ptr + n * BPP, temp + 128, r * BPP); \ +#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ + void NAMEANY(const uint8_t* y_buf, const uint8_t* uv_buf, uint8_t* dst_ptr, \ + int width) { \ + SIMD_ALIGNED(uint8_t temp[128 * 3]); \ + memset(temp, 0, 128 * 2); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(y_buf, uv_buf, dst_ptr, n); \ + } \ + memcpy(temp, y_buf + n * SBPP, r * SBPP); \ + memcpy(temp + 128, uv_buf + (n >> UVSHIFT) * SBPP2, \ + SS(r, UVSHIFT) * SBPP2); \ + ANY_SIMD(temp, temp + 128, temp + 256, MASK + 1); \ + memcpy(dst_ptr + n * BPP, temp + 256, r * BPP); \ } // Merge functions. @@ -581,6 +581,9 @@ ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7) #ifdef HAS_NV21TOYUV24ROW_NEON ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15) #endif +#ifdef HAS_NV21TOYUV24ROW_SSSE3 +ANY21(NV21ToYUV24Row_Any_SSSE3, NV21ToYUV24Row_SSSE3, 1, 1, 2, 3, 15) +#endif #ifdef HAS_NV21TOYUV24ROW_AVX2 ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31) #endif diff --git a/source/row_gcc.cc b/source/row_gcc.cc index a5f739893..a7523c468 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -8894,127 +8894,135 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, } #endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3 -#ifdef HAS_NV21TOYUV24ROW_AVX2 +static const uvec8 kYUV24Shuffle[3] = + {{ 8, 9, 0, 8, 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12 }, + { 9, 1, 10, 11, 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15 }, + { 2, 10, 11, 3, 12, 13, 4, 12, 13, 5, 14, 15, 6, 14, 15, 7 }}; -// begin NV21ToYUV24Row_C avx2 constants -static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, - 0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00}; +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. 
+// YUV24 is VUY in memory +void NV21ToYUV24Row_SSSE3(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "movdqa (%4),%%xmm4 \n" // 3 shuffler constants + "movdqa 16(%4),%%xmm5 \n" + "movdqa 32(%4),%%xmm6 \n" + "1: \n" + "movdqu (%0),%%xmm2 \n" // load 16 Y values + "movdqu (%0,%1),%%xmm3 \n" // load 8 VU values + "lea 16(%0),%0 \n" + "movdqa %%xmm2,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "shufps $0x44,%%xmm3,%%xmm0 \n" // Y 0..7, UV 0..3 + "shufps $0x99,%%xmm3,%%xmm1 \n" // Y 4..11, UV 2..5 + "shufps $0xee,%%xmm3,%%xmm2 \n" // Y 8..15, UV 4..7 + "pshufb %%xmm4, %%xmm0 \n" // weave into YUV24 + "pshufb %%xmm5, %%xmm1 \n" + "pshufb %%xmm6, %%xmm2 \n" + "movdqu %%xmm0,(%2) \n" + "movdqu %%xmm1,16(%2) \n" + "movdqu %%xmm2,32(%2) \n" + "lea 48(%2),%2 \n" + "sub $16,%3 \n" // 16 pixels per loop + "jg 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} -static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80}; - -static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, - 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, - 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, - 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00}; - -static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05, - 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d, - 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05}; - -static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, - 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, - 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80}; - -static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, - 0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, - 0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f}; - -static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80, - 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80, - 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80}; - -static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, - 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, - 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a}; - -static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, - 0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, - 0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80}; - -// NV21ToYUV24Row_AVX2 +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. 
+// YUV24 is VUY in memory void NV21ToYUV24Row_AVX2(const uint8_t* src_y, const uint8_t* src_vu, uint8_t* dst_yuv24, int width) { - uint8_t* src_y_ptr; - uint64_t src_offset = 0; - uint64_t width64; - - width64 = width; - src_y_ptr = (uint8_t*)src_y; - asm volatile( - "vmovdqu %5, %%ymm0 \n" // init blend value - "vmovdqu %6, %%ymm1 \n" // init blend value - "vmovdqu %7, %%ymm2 \n" // init blend value - // "sub $0x20, %3 \n" //sub 32 from - // width for final loop + "sub %0,%1 \n" + "vbroadcastf128 (%4),%%ymm4 \n" // 3 shuffler constants + "vbroadcastf128 16(%4),%%ymm5 \n" + "vbroadcastf128 32(%4),%%ymm6 \n" - LABELALIGN - "1: \n" // label 1 - "vmovdqu (%0,%4), %%ymm3 \n" // src_y - "vmovdqu 1(%1,%4), %%ymm4 \n" // src_uv+1 - "vmovdqu (%1), %%ymm5 \n" // src_uv - "vpshufb %8, %%ymm3, %%ymm13 \n" // y, kSHUF0 for shuf - "vpshufb %9, %%ymm4, %%ymm14 \n" // uv+1, kSHUF1 for - // shuf - "vpshufb %10, %%ymm5, %%ymm15 \n" // uv, kSHUF2 for - // shuf - "vpshufb %11, %%ymm3, %%ymm3 \n" // y kSHUF3 for shuf - "vpshufb %12, %%ymm4, %%ymm4 \n" // uv+1 kSHUF4 for - // shuf - "vpblendvb %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n" // blend 0 - "vpblendvb %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n" // blend 0 - "vpblendvb %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n" // blend 2 - "vpblendvb %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n" // blend 1 - "vpshufb %13, %%ymm5, %%ymm15 \n" // shuffle const - "vpor %%ymm4, %%ymm3, %%ymm5 \n" // get results - "vmovdqu %%ymm12, 0x20(%2) \n" // store dst_yuv+20h - "vpor %%ymm15, %%ymm5, %%ymm3 \n" // get results - "add $0x20, %4 \n" // add to src buffer - // ptr - "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n" // insert - "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n" // insert - "vmovdqu %%ymm4, (%2) \n" // store dst_yuv - "vmovdqu %%ymm5, 0x40(%2) \n" // store dst_yuv+40h - "add $0x60,%2 \n" // add to dst buffer - // ptr - // "cmp %3, %4 \n" //(width64 - - // 32 bytes) and src_offset - "sub $0x20,%3 \n" // 32 pixels per loop - "jg 1b \n" - "vzeroupper \n" // sse-avx2 - // transistions - - : "+r"(src_y), //%0 - "+r"(src_vu), //%1 - "+r"(dst_yuv24), //%2 - "+r"(width64), //%3 - "+r"(src_offset) //%4 - : "m"(kBLEND0), //%5 - "m"(kBLEND1), //%6 - "m"(kBLEND2), //%7 - "m"(kSHUF0), //%8 - "m"(kSHUF1), //%9 - "m"(kSHUF2), //%10 - "m"(kSHUF3), //%11 - "m"(kSHUF4), //%12 - "m"(kSHUF5) //%13 - : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", - "xmm13", "xmm14", "xmm15"); + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vshufps $0x44,%%ymm3,%%ymm2,%%ymm0 \n" // Y 0..7, UV 0..3 + "vshufps $0x99,%%ymm3,%%ymm2,%%ymm1 \n" // Y 4..11, UV 2..5 + "vshufps $0xee,%%ymm3,%%ymm2,%%ymm2 \n" // Y 8..15, UV 4..7 + "vpshufb %%ymm4,%%ymm0,%%ymm0 \n" // weave into YUV24 + "vpshufb %%ymm5,%%ymm1,%%ymm1 \n" + "vpshufb %%ymm6,%%ymm2,%%ymm2 \n" + "vperm2i128 $0x20,%%ymm1,%%ymm0,%%ymm3 \n" + "vperm2i128 $0x30,%%ymm0,%%ymm2,%%ymm0 \n" + "vperm2i128 $0x31,%%ymm2,%%ymm1,%%ymm1 \n" + "vmovdqu %%ymm3,(%2) \n" + "vmovdqu %%ymm0,32(%2) \n" + "vmovdqu %%ymm1,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" // 32 pixels per loop + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); } -#endif // HAS_NV21TOYUV24ROW_AVX2 + +#ifdef HAS_NV21ToYUV24ROW_AVX512 +// The following VBMI VEX256 code tests okay with the Intel SDE emulator. 
+static const lvec8 kYUV24Perm[3] = + {{ 32, 33, 0, 32, 33, 1, 34, 35, 2, 34, 35, 3, 36, 37, 4, 36, + 37, 5, 38, 39, 6, 38, 39, 7, 40, 41, 8, 40, 41, 9, 42, 43 }, + { 10, 42, 43, 11, 44, 45, 12, 44, 45, 13, 46, 47, 14, 46, 47, 15, + 48, 49, 16, 48, 49, 17, 50, 51, 18, 50, 51, 19, 52, 53, 20, 52 }, + { 53, 21, 54, 55, 22, 54, 55, 23, 56, 57, 24, 56, 57, 25, 58, 59, + 26, 58, 59, 27, 60, 61, 28, 60, 61, 29, 62, 63, 30, 62, 63, 31 }}; + +void NV21ToYUV24Row_AVX512(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "sub %0,%1 \n" + "vmovdqa (%4),%%ymm4 \n" // 3 shuffler constants + "vmovdqa 32(%4),%%ymm5 \n" + "vmovdqa 64(%4),%%ymm6 \n" + LABELALIGN + "1: \n" + "vmovdqu (%0),%%ymm2 \n" // load 32 Y values + "vmovdqu (%0,%1),%%ymm3 \n" // load 16 VU values + "lea 32(%0),%0 \n" + "vmovdqa %%ymm2, %%ymm0 \n" + "vmovdqa %%ymm2, %%ymm1 \n" + "vpermt2b %%ymm3,%%ymm4,%%ymm0 \n" + "vpermt2b %%ymm3,%%ymm5,%%ymm1 \n" + "vpermt2b %%ymm3,%%ymm6,%%ymm2 \n" + "vmovdqu %%ymm0,(%2) \n" + "vmovdqu %%ymm1,32(%2) \n" + "vmovdqu %%ymm2,64(%2) \n" + "lea 96(%2),%2 \n" + "sub $32,%3 \n" + "jg 1b \n" + "vzeroupper \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Perm[0]) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); +} + +#endif // HAS_NV21ToYUV24ROW_AVX512 #ifdef HAS_SWAPUVROW_SSSE3 diff --git a/source/row_neon64.cc b/source/row_neon64.cc index fff278708..b781bda34 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -15,6 +15,9 @@ namespace libyuv { extern "C" { #endif +// Enable LIBYUV_USE_ST2 and LIBYUV_USE_ST3 for CPUs that prefer them. +// Exynos M1, M2, M3 are slow with ST2, ST3 and ST4 instructions. + // This module is for GCC Neon armv8 64 bit. #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) @@ -1683,6 +1686,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v23"); } +#if LIBYUV_USE_ST2 void ARGBToAR64Row_NEON(const uint8_t* src_argb, uint16_t* dst_ar64, int width) { @@ -1702,6 +1706,28 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, : : "cc", "memory", "v0", "v1", "v2", "v3"); } +#else +void ARGBToAR64Row_NEON(const uint8_t* src_argb, + uint16_t* dst_ar64, + int width) { + asm volatile( + "1: \n" + "ldp q0, q1, [%0], #32 \n" // load 8 ARGB pixels + "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "zip1 v2.16b, v0.16b, v0.16b \n" + "zip2 v3.16b, v0.16b, v0.16b \n" + "prfm pldl1keep, [%0, 448] \n" + "zip1 v4.16b, v1.16b, v1.16b \n" + "zip2 v5.16b, v1.16b, v1.16b \n" + "st1 {v2.16b, v3.16b, v4.16b, v5.16b}, [%1], #64 \n" // 8 AR64 + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_ar64), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5"); +} +#endif // LIBYUV_USE_ST2 static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; @@ -3669,6 +3695,7 @@ void GaussRow_F32_NEON(const float* src, float* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); } +#if LIBYUV_USE_ST3 // Convert biplanar NV21 to packed YUV24 void NV21ToYUV24Row_NEON(const uint8_t* src_y, const uint8_t* src_vu, @@ -3692,8 +3719,42 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, : : "cc", "memory", "v0", "v1", "v2"); } +#else +static const uvec8 kYUV24Shuffle[3] = + {{ 16, 17, 0, 16, 17, 1, 18, 19, 2, 18, 19, 3, 20, 21, 4, 20 }, + { 21, 5, 22, 23, 6, 22, 23, 7, 24, 25, 8, 24, 25, 9, 26, 27 }, + { 10, 26, 27, 11, 28, 29, 12, 28, 29, 13, 30, 31, 14, 30, 31, 15 }}; -// AYUV is YVUA in memory. UV for NV12 is UV order in memory. +// Convert biplanar NV21 to packed YUV24 +// NV21 has VU in memory for chroma. +// YUV24 is VUY in memory +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile( + "ld1 {v5.16b,v6.16b,v7.16b}, [%4]\n" // 3 shuffler constants + "1: \n" + "ld1 {v0.16b}, [%0], #16 \n" // load 16 Y values + "ld1 {v1.16b}, [%1], #16 \n" // load 8 VU values + "tbl v2.16b, {v0.16b,v1.16b}, v5.16b\n" // weave into YUV24 + "prfm pldl1keep, [%0, 448] \n" + "tbl v3.16b, {v0.16b,v1.16b}, v6.16b\n" + "prfm pldl1keep, [%1, 448] \n" + "tbl v4.16b, {v0.16b,v1.16b}, v7.16b\n" + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st1 {v2.16b,v3.16b,v4.16b}, [%2], #48\n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : "r"(&kYUV24Shuffle[0]) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); +} +#endif // LIBYUV_USE_ST3 + +// AYUV is VUYA in memory. UV for NV12 is UV order in memory. void AYUVToUVRow_NEON(const uint8_t* src_ayuv, int src_stride_ayuv, uint8_t* dst_uv, @@ -3708,8 +3769,8 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. @@ -3737,8 +3798,8 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. - "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" "subs %w3, %w3, #16 \n" // 16 processed per loop.