From 312c02a5aad4adda67cb2e0cc93a497d12845522 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 24 Mar 2021 13:45:04 -0700 Subject: [PATCH] Fixes for SplitUVPlane_16 and MergeUVPlane_16 Planar functions pass depth instead of a scale factor. Row functions pass shift instead of depth. Add asserts to the C versions. The AVX shift instruction expects a single shift value in an XMM register. NEON passes shift as an input (not an output). Split NEON is reimplemented as a left shift on shorts by a negative amount to achieve a right shift. Add planar unit tests. Bug: libyuv:888 Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086 Reviewed-by: richard winterton Reviewed-by: Mirko Bonadei --- README.chromium | 2 +- include/libyuv/convert.h | 18 +-- include/libyuv/version.h | 2 +- source/convert.cc | 30 ++-- source/planar_functions.cc | 49 ++++--- source/row_common.cc | 8 ++ source/row_gcc.cc | 17 +-- source/row_neon.cc | 39 ++---- source/row_neon64.cc | 278 ++++++++++++++++++------------------- unit_test/planar_test.cc | 98 +++++++++++++ 10 files changed, 311 insertions(+), 230 deletions(-) diff --git a/README.chromium b/README.chromium index 8e6d8cb40..8b2021ac3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1782 +Version: 1783 License: BSD License File: LICENSE diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h index c24430cc6..93e7550be 100644 --- a/include/libyuv/convert.h +++ b/include/libyuv/convert.h @@ -696,15 +696,15 @@ int RAWToI420(const uint8_t* src_raw, // RGB big endian (rgb in memory) to J420. LIBYUV_API int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height); + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height); // RGB16 (RGBP fourcc) little endian to I420. LIBYUV_API diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5aa410032..2f565a747 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1782 +#define LIBYUV_VERSION 1783 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert.cc b/source/convert.cc index 768e0f379..69f7fb6e0 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y, } // Any I[420]1[02] to P[420]1[02] format with mirroring.
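For context, the depth/shift contract this patch settles on can be modeled in a few lines of scalar C. This is an illustrative sketch, not part of the patch, and the helper names are hypothetical: planar entry points take a bit depth (10, 12 or 16), and the row functions derive the shift from it, shifting left to merge (lsb to msb) and right to split (msb to lsb).

    #include <stdint.h>

    // Merge one U/V pair: depth-bit lsb-aligned values to msb-aligned UV.
    static void MergeUVPixel_16(uint16_t u, uint16_t v, uint16_t* dst_uv,
                                int depth) {
      int shift = 16 - depth;  // e.g. 6 for 10-bit data
      dst_uv[0] = (uint16_t)(u << shift);
      dst_uv[1] = (uint16_t)(v << shift);
    }

    // Split one UV pair: msb-aligned UV to depth-bit lsb-aligned U and V.
    static void SplitUVPixel_16(const uint16_t* src_uv, uint16_t* u,
                                uint16_t* v, int depth) {
      int shift = 16 - depth;
      *u = (uint16_t)(src_uv[0] >> shift);
      *v = (uint16_t)(src_uv[1] >> shift);
    }

The SIMD paths below implement the same mapping. The AVX2 versions keep only the vmovd, because vpsllw/vpsrlw read a single shift count from the low 64 bits of their XMM operand; the removed word broadcast replicated the count into those bits, which the instruction read as one out-of-range shift. The NEON versions pass the shift as an input operand, and the split path uses (depth - 16) so that a left shift by a negative count performs the required right shift.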
-static int Ix1xToPx1x(const uint16_t* src_y, +static int IxxxToPxxx(const uint16_t* src_y, int src_stride_y, const uint16_t* src_u, int src_stride_u, @@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 10); } @@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 10); } @@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 1, 12); } @@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y, int dst_stride_uv, int width, int height) { - return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v, + return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv, width, height, 1, 0, 12); } @@ -2246,20 +2246,20 @@ int RAWToI420(const uint8_t* src_raw, // Convert RAW to J420. LIBYUV_API int RAWToJ420(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_u, - int dst_stride_u, - uint8_t* dst_v, - int dst_stride_v, - int width, - int height) { + int src_stride_raw, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_u, + int dst_stride_u, + uint8_t* dst_v, + int dst_stride_v, + int width, + int height) { int y; #if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \ defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI) void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw, - uint8_t* dst_u, uint8_t* dst_v, int width) = + uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVJRow_C; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = RAWToYJRow_C; diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 2f2089fbd..38287af02 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -10,6 +10,7 @@ #include "libyuv/planar_functions.h" +#include <assert.h> #include <string.h>  // for memset() #include "libyuv/cpu_id.h" @@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv, int height, int depth) { int y; - int scale = 1 << depth; - void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v, - int scale, int width) = SplitUVRow_16_C; + void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u, + uint16_t* dst_v, int depth, int width) = + SplitUVRow_16_C; // Negative height means invert the image.
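// Illustrative note, not part of the patch: by libyuv convention a negative
// height mirrors the image vertically; the code below points the destination
// at its last row and negates the destination strides so rows are written
// bottom-up.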
if (height < 0) { height = -height; @@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv, } #if defined(HAS_SPLITUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - SplitUVRow = SplitUVRow_16_Any_AVX2; + SplitUVRow_16 = SplitUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - SplitUVRow = SplitUVRow_16_AVX2; + SplitUVRow_16 = SplitUVRow_16_AVX2; } } #endif #if defined(HAS_SPLITUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - SplitUVRow = SplitUVRow_16_Any_NEON; + SplitUVRow_16 = SplitUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - SplitUVRow = SplitUVRow_16_NEON; + SplitUVRow_16 = SplitUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Copy a row of UV. - SplitUVRow(src_uv, dst_u, dst_v, scale, width); + SplitUVRow_16(src_uv, dst_u, dst_v, depth, width); dst_u += dst_stride_u; dst_v += dst_stride_v; src_uv += src_stride_uv; @@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u, int height, int depth) { int y; - int scale = 1 << (16 - depth); - void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v, - uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C; + void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v, + uint16_t* dst_uv, int depth, int width) = + MergeUVRow_16_C; + assert(depth >= 8); + assert(depth <= 16); // Negative height means invert the image. if (height < 0) { height = -height; @@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u, } #if defined(HAS_MERGEUVROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MergeUVRow = MergeUVRow_16_Any_AVX2; + MergeUVRow_16 = MergeUVRow_16_Any_AVX2; if (IS_ALIGNED(width, 16)) { - MergeUVRow = MergeUVRow_16_AVX2; + MergeUVRow_16 = MergeUVRow_16_AVX2; } } #endif #if defined(HAS_MERGEUVROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MergeUVRow = MergeUVRow_16_Any_NEON; + MergeUVRow_16 = MergeUVRow_16_Any_NEON; if (IS_ALIGNED(width, 8)) { - MergeUVRow = MergeUVRow_16_NEON; + MergeUVRow_16 = MergeUVRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { // Merge a row of U and V into a row of UV. - MergeUVRow(src_u, src_v, dst_uv, scale, width); + MergeUVRow_16(src_u, src_v, dst_uv, depth, width); src_u += src_stride_u; src_v += src_stride_v; dst_uv += dst_stride_uv; @@ -671,8 +674,8 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, int depth) { int y; int scale = 1 << (16 - depth); - void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale, - int width) = MultiplyRow_16_C; + void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale, + int width) = MultiplyRow_16_C; // Negative height means invert the image. 
if (height < 0) { height = -height; @@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y, #if defined(HAS_MULTIPLYROW_16_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - MultiplyRow = MultiplyRow_16_Any_AVX2; + MultiplyRow_16 = MultiplyRow_16_Any_AVX2; if (IS_ALIGNED(width, 32)) { - MultiplyRow = MultiplyRow_16_AVX2; + MultiplyRow_16 = MultiplyRow_16_AVX2; } } #endif #if defined(HAS_MULTIPLYROW_16_NEON) if (TestCpuFlag(kCpuHasNEON)) { - MultiplyRow = MultiplyRow_16_Any_NEON; + MultiplyRow_16 = MultiplyRow_16_Any_NEON; if (IS_ALIGNED(width, 16)) { - MultiplyRow = MultiplyRow_16_NEON; + MultiplyRow_16 = MultiplyRow_16_NEON; } } #endif for (y = 0; y < height; ++y) { - MultiplyRow(src_y, dst_y, scale, width); + MultiplyRow_16(src_y, dst_y, scale, width); src_y += src_stride_y; dst_y += dst_stride_y; } diff --git a/source/row_common.cc b/source/row_common.cc index b80e0b3b9..0e84961b3 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -10,6 +10,7 @@ #include "libyuv/row.h" +#include <assert.h> #include <stdio.h> #include <string.h>  // For memcpy and memset. @@ -3045,6 +3046,8 @@ void MergeUVRow_16_C(const uint16_t* src_u, int depth, int width) { int shift = 16 - depth; + assert(depth >= 8); + assert(depth <= 16); int x; for (x = 0; x < width; ++x) { dst_uv[0] = src_u[x] << shift; @@ -3061,6 +3064,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv, int width) { int shift = 16 - depth; int x; + assert(depth >= 8); + assert(depth <= 16); for (x = 0; x < width; ++x) { dst_u[x] = src_uv[0] >> shift; dst_v[x] = src_uv[1] >> shift; @@ -3098,6 +3103,9 @@ void Convert16To8Row_C(const uint16_t* src_y, int scale, int width) { int x; + assert(scale >= 256); + assert(scale <= 32768); + for (x = 0; x < width; ++x) { dst_y[x] = clamp255((src_y[x] * scale) >> 16); } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 1b4ad9b03..2591bcfee 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -4728,8 +4728,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" "sub %0,%1 \n" // 16 pixels per loop. @@ -4761,7 +4759,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u, } #endif // HAS_MERGEUVROW_AVX2 -#ifdef HAS_MERGEUVROW_16_AVX2 +#ifdef HAS_SPLITUVROW_16_AVX2 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}; void SplitUVRow_16_AVX2(const uint16_t* src_uv, @@ -4773,8 +4771,6 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, // clang-format off asm volatile ( "vmovd %4,%%xmm3 \n" - "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n" - "vbroadcastss %%xmm3,%%xmm3 \n" "vbroadcastf128 %5,%%ymm4 \n" "sub %1,%2 \n" @@ -4800,16 +4796,15 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv, "jg 1b \n" "vzeroupper \n" : "+r"(src_uv), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width), // %3 - "+r"(depth) // %4 - : + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(depth), // %4 "m"(kSplitUVShuffle16) // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"); // clang-format on } -#endif // HAS_MERGEUVROW_AVX2 +#endif // HAS_SPLITUVROW_16_AVX2 // Use scale to convert lsb formats to msb, depending how many bits there are: // 128 = 9 bits diff --git a/source/row_neon.cc b/source/row_neon.cc index 5d109a3b4..2165d0d01 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift.
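// Illustrative note, not part of the patch: NEON vshl treats each lane's
// shift count as signed, shifting left for positive counts and right for
// negative ones. A single vshl.u16 by (depth - 16) therefore gives the
// right shift by (16 - depth) directly, replacing the old widen-to-32-bit,
// shift, narrow sequence in the asm below.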
asm volatile( - "vdup.32 q0, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" - "vld2.16 {q1, q2}, [%0]! \n" // load 8 UV - "vmovl.u16 q3, d2 \n" - "vmovl.u16 q4, d3 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d2, q3 \n" - "vmovn.u32 d3, q4 \n" - "vmovl.u16 q3, d4 \n" - "vmovl.u16 q4, d5 \n" - "vshl.u32 q3, q3, q0 \n" - "vshl.u32 q4, q4, q0 \n" - "vmovn.u32 d4, q3 \n" - "vmovn.u32 d5, q4 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop - "vst1.16 {q1}, [%1]! \n" // store 8 U pixels - "vst1.16 {q2}, [%2]! \n" // store 8 V pixels + "vld2.16 {q0, q1}, [%0]! \n" // load 8 UV + "vshl.u16 q0, q0, q2 \n" + "vshl.u16 q1, q1, q2 \n" + "subs %3, %3, #8 \n" // 8 src pixels per loop + "vst1.16 {q0}, [%1]! \n" // store 8 U pixels + "vst1.16 {q1}, [%2]! \n" // store 8 V pixels "bgt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } @@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "vdup.16 q2, %3 \n" + "vdup.16 q2, %4 \n" "1: \n" "vld1.16 {q0}, [%0]! \n" // load 8 U "vld1.16 {q1}, [%1]! \n" // load 8 V "vshl.u16 q0, q0, q2 \n" "vshl.u16 q1, q1, q2 \n" - "subs %4, %4, #8 \n" // 8 src pixels per loop + "subs %3, %3, #8 \n" // 8 src pixels per loop "vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels "bgt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "q0", "q1", "q2"); } diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3281e90f1..903bf5cd4 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -673,8 +673,8 @@ void SplitUVRow_NEON(const uint8_t* src_uv, asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pairs of UV - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store U "st1 {v1.16b}, [%2], #16 \n" // store V "b.gt 1b \n" @@ -696,9 +696,9 @@ void MergeUVRow_NEON(const uint8_t* src_u, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load U "ld1 {v1.16b}, [%1], #16 \n" // load V + "subs %w3, %w3, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" - "subs %w3, %w3, #16 \n" // 16 processed per loop "st2 {v0.16b,v1.16b}, [%2], #32 \n" // store 16 pairs of UV "b.gt 1b \n" : "+r"(src_u), // %0 @@ -719,8 +719,8 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb, asm volatile( "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store R "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%3], #16 \n" // store B @@ -746,12 +746,11 @@ void MergeRGBRow_NEON(const uint8_t* src_r, "ld1 {v0.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v2.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st3 {v0.16b,v1.16b,v2.16b}, [%3], #48 \n" // store 16 RGB - "prfm pldl1keep, [%0, 448] \n" "b.gt 1b \n" : "+r"(src_r), // %0 "+r"(src_g), // %1 @@ -773,8 +772,8 @@ void SplitARGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 
{v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w5, %w5, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -804,11 +803,11 @@ void MergeARGBRow_NEON(const uint8_t* src_r, "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B "ld1 {v3.16b}, [%3], #16 \n" // load A + "subs %w5, %w5, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" "prfm pldl1keep, [%3, 448] \n" - "subs %w5, %w5, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%4], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -831,8 +830,8 @@ void SplitXRGBRow_NEON(const uint8_t* src_rgba, asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 processed per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%3], #16 \n" // store B "st1 {v1.16b}, [%2], #16 \n" // store G "st1 {v2.16b}, [%1], #16 \n" // store R @@ -859,10 +858,10 @@ void MergeXRGBRow_NEON(const uint8_t* src_r, "ld1 {v2.16b}, [%0], #16 \n" // load R "ld1 {v1.16b}, [%1], #16 \n" // load G "ld1 {v0.16b}, [%2], #16 \n" // load B + "subs %w4, %w4, #16 \n" // 16 processed per loop "prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%2, 448] \n" - "subs %w4, %w4, #16 \n" // 16 processed per loop "st4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%3], #64 \n" // store 16ARGB "b.gt 1b \n" : "+r"(src_r), // %0 @@ -1072,9 +1071,9 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) { "movi v5.8b, #255 \n" // Alpha "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st4 {v2.8b,v3.8b,v4.8b,v5.8b}, [%1], #32 \n" // store b g r a "b.gt 1b \n" @@ -1091,9 +1090,9 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { "movi v0.8b, #255 \n" // Alpha "1: \n" "ld3 {v3.8b,v4.8b,v5.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v2.8b, v4.8b, v4.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v5.8b, v5.8b \n" // move r "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store a b g r "b.gt 1b \n" @@ -1109,9 +1108,9 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) { asm volatile( "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v3.8b, v1.8b, v1.8b \n" // move g + "prfm pldl1keep, [%0, 448] \n" "orr v4.8b, v0.8b, v0.8b \n" // move r "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r "b.gt 1b \n" @@ -1143,8 +1142,8 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565, "movi v3.8b, #255 \n" // Alpha "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -1233,8 +1232,8 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444, asm volatile( "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -1252,8 +1251,8 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb, asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st3 {v1.8b,v2.8b,v3.8b}, [%1], #24 \n" // store 8 pixels of // RGB24 "b.gt 1b \n" @@ -1269,9 +1268,9 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) { asm volatile( "1: \n" "ld4 {v1.8b,v2.8b,v3.8b,v4.8b}, [%0], #32 \n" // load b g r a - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "orr v4.8b, v2.8b, v2.8b \n" // mov g + "prfm pldl1keep, [%0, 448] \n" "orr v5.8b, v1.8b, v1.8b \n" // mov b "st3 {v3.8b,v4.8b,v5.8b}, [%1], #24 \n" // store r g b "b.gt 1b \n" @@ -1287,8 +1286,8 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of YUY2. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_yuy2), // %0 @@ -1303,8 +1302,8 @@ void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld2 {v0.16b,v1.16b}, [%0], #32 \n" // load 16 pixels of UYVY. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.16b}, [%1], #16 \n" // store 16 pixels of Y. "b.gt 1b \n" : "+r"(src_uyvy), // %0 @@ -1322,8 +1321,8 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 YUY2 - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v1.8b}, [%1], #8 \n" // store 8 U. "st1 {v3.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1343,8 +1342,8 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 UYVY - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels = 8 UVs. + "prfm pldl1keep, [%0, 448] \n" "st1 {v0.8b}, [%1], #8 \n" // store 8 U. "st1 {v2.8b}, [%2], #8 \n" // store 8 V. "b.gt 1b \n" @@ -1366,10 +1365,10 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v1.8b, v1.8b, v5.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v3.8b, v3.8b, v7.8b \n" // average rows of V "st1 {v1.8b}, [%2], #8 \n" // store 8 U. "st1 {v3.8b}, [%3], #8 \n" // store 8 V. @@ -1394,10 +1393,10 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy, asm volatile( "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 16 pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w4, %w4, #16 \n" // 16 pixels = 8 UVs. 
"ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load next row "urhadd v0.8b, v0.8b, v4.8b \n" // average rows of U + "prfm pldl1keep, [%0, 448] \n" "urhadd v2.8b, v2.8b, v6.8b \n" // average rows of V "st1 {v0.8b}, [%2], #8 \n" // store 8 U. "st1 {v2.8b}, [%3], #8 \n" // store 8 V. @@ -1422,8 +1421,8 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb, "ld1 {v2.16b}, [%3] \n" // shuffler "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 4 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #4 \n" // 4 processed per loop + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v0.16b}, v2.16b \n" // look up 4 pixels "st1 {v1.16b}, [%1], #16 \n" // store 4. "b.gt 1b \n" @@ -1443,11 +1442,11 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v0.8b, v1.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" + "subs %w4, %w4, #16 \n" // 16 pixels "orr v2.8b, v1.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v1.8b}, [%1], #8 \n" // load 8 Us "ld1 {v3.8b}, [%2], #8 \n" // load 8 Vs - "subs %w4, %w4, #16 \n" // 16 pixels "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%3], #32 \n" // Store 16 pixels. "b.gt 1b \n" : "+r"(src_y), // %0 @@ -1467,8 +1466,8 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y, asm volatile( "1: \n" "ld2 {v1.8b,v2.8b}, [%0], #16 \n" // load 16 Ys - "prfm pldl1keep, [%0, 448] \n" "orr v3.8b, v2.8b, v2.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v0.8b}, [%1], #8 \n" // load 8 Us "ld1 {v2.8b}, [%2], #8 \n" // load 8 Vs "subs %w4, %w4, #16 \n" // 16 pixels @@ -1490,8 +1489,8 @@ void ARGBToRGB565Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTORGB565 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels RGB565. "b.gt 1b \n" @@ -1511,9 +1510,9 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v20.8b, v20.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v21.8b, v21.8b, v1.8b \n" "uqadd v22.8b, v22.8b, v1.8b \n" ARGBTORGB565 "st1 {v0.16b}, [%0], #16 \n" // store 8 pixels RGB565. @@ -1532,8 +1531,8 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB1555 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" @@ -1553,8 +1552,8 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%0], #32 \n" // load 8 // pixels - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. + "prfm pldl1keep, [%0, 448] \n" ARGBTOARGB4444 "st1 {v0.16b}, [%1], #16 \n" // store 8 pixels "b.gt 1b \n" @@ -1575,8 +1574,8 @@ void ARGBToAR64Row_NEON(const uint8_t* src_argb, "1: \n" "ldp q0, q2, [%0], #32 \n" // load 8 pixels "mov v1.16b, v0.16b \n" - "mov v3.16b, v2.16b \n" "prfm pldl1keep, [%0, 448] \n" + "mov v3.16b, v2.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels @@ -1597,9 +1596,9 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb, "ldp q0, q2, [%0], #32 \n" // load 8 pixels "tbl v0.16b, {v0.16b}, v4.16b \n" "tbl v2.16b, {v2.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "mov v1.16b, v0.16b \n" "mov v3.16b, v2.16b \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "st2 {v0.16b, v1.16b}, [%1], #32 \n" // store 4 pixels "st2 {v2.16b, v3.16b}, [%1], #32 \n" // store 4 pixels @@ -1622,8 +1621,8 @@ void AR64ToARGBRow_NEON(const uint16_t* src_ar64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "prfm pldl1keep, [%0, 448] \n" "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels @@ -1646,8 +1645,8 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, "1: \n" "ldp q0, q1, [%0], #32 \n" // load 4 pixels "ldp q2, q3, [%0], #32 \n" // load 4 pixels - "prfm pldl1keep, [%0, 448] \n" "tbl v0.16b, {v0.16b, v1.16b}, v4.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v2.16b, {v2.16b, v3.16b}, v4.16b \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "stp q0, q2, [%1], #32 \n" // store 8 pixels @@ -1667,9 +1666,9 @@ void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1708,9 +1707,9 @@ void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -1730,9 +1729,9 @@ void RGBAToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v2.8b, v5.8b \n" // G "umlal v0.8h, v3.8b, v6.8b \n" // R "uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -1760,9 +1759,9 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb, "movi v29.16b,#0x80 \n" // 128.5 "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlsl v4.8h, v1.8b, v25.8b \n" // G "umlsl v4.8h, v2.8b, v26.8b \n" // R "add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned @@ -1823,14 +1822,14 @@ void ARGBToUVRow_NEON(const uint8_t* src_argb, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. 
+ "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1869,13 +1868,13 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb, "movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit) "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -1909,13 +1908,13 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts. @@ -1949,13 +1948,13 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -1989,13 +1988,13 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba, RGBTOUV_SETUP_REG "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts. @@ -2029,13 +2028,13 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. 
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 16 more. - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. @@ -2069,13 +2068,13 @@ void RAWToUVRow_NEON(const uint8_t* src_raw, RGBTOUV_SETUP_REG "1: \n" "ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels. - "prfm pldl1keep, [%0, 448] \n" "uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts. "ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load 8 more RAW pixels - "prfm pldl1keep, [%1, 448] \n" "uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts. @@ -2110,9 +2109,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB565TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 RGB565 pixels. @@ -2122,9 +2121,9 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB565TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 RGB565 pixels. @@ -2168,9 +2167,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, RGBTOUV_SETUP_REG "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" RGB555TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB1555 pixels. @@ -2180,9 +2179,9 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%1, 448] \n" RGB555TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB1555 pixels. @@ -2226,9 +2225,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, RGBTOUV_SETUP_REG // sets v20-v25 "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" ARGB4444TOARGB "uaddlp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uaddlp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%0], #16 \n" // next 8 ARGB4444 pixels. @@ -2238,9 +2237,9 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444, "uaddlp v28.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // load 8 ARGB4444 pixels. 
- "prfm pldl1keep, [%1, 448] \n" ARGB4444TOARGB "uadalp v16.4h, v0.8b \n" // B 8 bytes -> 4 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v17.4h, v1.8b \n" // G 8 bytes -> 4 shorts. "uadalp v18.4h, v2.8b \n" // R 8 bytes -> 4 shorts. "ld1 {v0.16b}, [%1], #16 \n" // next 8 ARGB4444 pixels. @@ -2283,10 +2282,10 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 RGB565 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. RGB565TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2311,10 +2310,10 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555, "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB1555 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB1555TOARGB "umull v3.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v5.8b \n" // G "umlal v3.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2338,10 +2337,10 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444, "movi v27.8b, #16 \n" // Add 16 constant "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 8 ARGB4444 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. ARGB4444TOARGB "umull v3.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v3.8h, v1.8b, v25.8b \n" // G "umlal v3.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y @@ -2363,9 +2362,9 @@ void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2387,9 +2386,9 @@ void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // R + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // B "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2411,9 +2410,9 @@ void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v1.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v2.8b, v5.8b \n" // G "umlal v16.8h, v3.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2435,9 +2434,9 @@ void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. 
"umull v16.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2459,9 +2458,9 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { "movi v7.8b, #16 \n" // Add 16 constant "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v16.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v16.8h, v1.8b, v5.8b \n" // G "umlal v16.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y @@ -2482,9 +2481,9 @@ void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { "movi v6.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y @@ -2504,11 +2503,11 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "movi v4.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. - "umull v0.8h, v0.8b, v4.8b \n" // R + "umull v0.8h, v0.8b, v4.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v0.8h, v1.8b, v5.8b \n" // G - "umlal v0.8h, v2.8b, v6.8b \n" // B + "umlal v0.8h, v2.8b, v6.8b \n" // R "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "b.gt 1b \n" @@ -2540,11 +2539,11 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "1: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" "umull v2.8h, v0.8b, v4.8b \n" + "prfm pldl1keep, [%1, 448] \n" "umull2 v3.8h, v0.16b, v4.16b \n" + "prfm pldl1keep, [%2, 448] \n" "umlal v2.8h, v1.8b, v5.8b \n" "umlal2 v3.8h, v1.16b, v5.16b \n" "rshrn v0.8b, v2.8h, #8 \n" @@ -2557,10 +2556,10 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, "50: \n" "ld1 {v0.16b}, [%1], #16 \n" "ld1 {v1.16b}, [%2], #16 \n" - "prfm pldl1keep, [%1, 448] \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "urhadd v0.16b, v0.16b, v1.16b \n" + "prfm pldl1keep, [%2, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 50b \n" "b 99f \n" @@ -2568,8 +2567,8 @@ void InterpolateRow_NEON(uint8_t* dst_ptr, // Blend 100 / 0 - Copy row unchanged. "100: \n" "ld1 {v0.16b}, [%1], #16 \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%0], #16 \n" "b.gt 100b \n" @@ -2596,11 +2595,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, "8: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB0 "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 ARGB1 - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. 
"umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2626,11 +2625,11 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb0, // ARGB0. "ld4 {v4.b,v5.b,v6.b,v7.b}[0], [%1], #4 \n" // load 1 pixel // ARGB1. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #1 \n" // 1 processed per loop. "umull v16.8h, v4.8b, v3.8b \n" // db * a + "prfm pldl1keep, [%0, 448] \n" "umull v17.8h, v5.8b, v3.8b \n" // dg * a + "prfm pldl1keep, [%1, 448] \n" "umull v18.8h, v6.8b, v3.8b \n" // dr * a "uqrshrn v16.8b, v16.8h, #8 \n" // db >>= 8 "uqrshrn v17.8b, v17.8h, #8 \n" // dg >>= 8 @@ -2664,9 +2663,9 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, // Attenuate 8 pixels. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v3.8b \n" // b * a + "prfm pldl1keep, [%0, 448] \n" "umull v5.8h, v1.8b, v3.8b \n" // g * a "umull v6.8h, v2.8b, v3.8b \n" // r * a "uqrshrn v0.8b, v4.8h, #8 \n" // b >>= 8 @@ -2697,9 +2696,9 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb, // 8 pixel loop. "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB. - "prfm pldl1keep, [%0, 448] \n" "subs %w1, %w1, #8 \n" // 8 processed per loop. "uxtl v0.8h, v0.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v1.8h, v1.8b \n" "uxtl v2.8h, v2.8b \n" "sqdmulh v0.8h, v0.8h, v4.8h \n" // b * scale @@ -2739,9 +2738,9 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb, // 8 pixel loop. "1: \n" "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v4.8h, v4.8b \n" // b (0 .. 255) + "prfm pldl1keep, [%0, 448] \n" "uxtl v5.8h, v5.8b \n" "uxtl v6.8h, v6.8b \n" "uxtl v7.8h, v7.8b \n" @@ -2772,9 +2771,9 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { "movi v26.8b, #77 \n" // R * 0.2990 coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v24.8b \n" // B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v25.8b \n" // G "umlal v4.8h, v2.8b, v26.8b \n" // R "uqrshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit B @@ -2807,9 +2806,9 @@ void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) { "movi v30.8b, #50 \n" // BR coefficient "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0] \n" // load 8 ARGB pixels. - "prfm pldl1keep, [%0, 448] \n" "subs %w1, %w1, #8 \n" // 8 processed per loop. "umull v4.8h, v0.8b, v20.8b \n" // B to Sepia B + "prfm pldl1keep, [%0, 448] \n" "umlal v4.8h, v1.8b, v21.8b \n" // G "umlal v4.8h, v2.8b, v22.8b \n" // R "umull v5.8h, v0.8b, v24.8b \n" // B to Sepia G @@ -2844,9 +2843,9 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb, "1: \n" "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%0], #32 \n" // load 8 ARGB - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop. "uxtl v16.8h, v16.8b \n" // b (0 .. 
255) 16 bit + "prfm pldl1keep, [%0, 448] \n" "uxtl v17.8h, v17.8b \n" // g "uxtl v18.8h, v18.8b \n" // r "uxtl v19.8h, v19.8b \n" // a @@ -2903,11 +2902,11 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "umull v0.8h, v0.8b, v4.8b \n" // multiply B + "prfm pldl1keep, [%0, 448] \n" "umull v1.8h, v1.8b, v5.8b \n" // multiply G + "prfm pldl1keep, [%1, 448] \n" "umull v2.8h, v2.8b, v6.8b \n" // multiply R "umull v3.8h, v3.8b, v7.8b \n" // multiply A "rshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit B @@ -2934,11 +2933,11 @@ void ARGBAddRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqadd v2.8b, v2.8b, v6.8b \n" "uqadd v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB @@ -2961,11 +2960,11 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb0, "1: \n" "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%1], #32 \n" // load 8 more - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqsub v0.8b, v0.8b, v4.8b \n" + "prfm pldl1keep, [%0, 448] \n" "uqsub v1.8b, v1.8b, v5.8b \n" + "prfm pldl1keep, [%1, 448] \n" "uqsub v2.8b, v2.8b, v6.8b \n" "uqsub v3.8b, v3.8b, v7.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB @@ -2993,11 +2992,11 @@ void SobelRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v1.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. "uqadd v0.8b, v0.8b, v1.8b \n" // add + "prfm pldl1keep, [%0, 448] \n" "orr v1.8b, v0.8b, v0.8b \n" + "prfm pldl1keep, [%1, 448] \n" "orr v2.8b, v0.8b, v0.8b \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" @@ -3019,10 +3018,10 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v0.16b}, [%0], #16 \n" // load 16 sobelx. "ld1 {v1.16b}, [%1], #16 \n" // load 16 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 processed per loop. + "prfm pldl1keep, [%0, 448] \n" "uqadd v0.16b, v0.16b, v1.16b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st1 {v0.16b}, [%2], #16 \n" // store 16 pixels. "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -3048,10 +3047,10 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx, "1: \n" "ld1 {v2.8b}, [%0], #8 \n" // load 8 sobelx. "ld1 {v0.8b}, [%1], #8 \n" // load 8 sobely. - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 processed per loop. 
+ "prfm pldl1keep, [%0, 448] \n" "uqadd v1.8b, v0.8b, v2.8b \n" // add + "prfm pldl1keep, [%1, 448] \n" "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%2], #32 \n" // store 8 ARGB "b.gt 1b \n" : "+r"(src_sobelx), // %0 @@ -3075,18 +3074,18 @@ void SobelXRow_NEON(const uint8_t* src_y0, "1: \n" "ld1 {v0.8b}, [%0],%5 \n" // top "ld1 {v1.8b}, [%0],%6 \n" - "prfm pldl1keep, [%0, 448] \n" "usubl v0.8h, v0.8b, v1.8b \n" + "prfm pldl1keep, [%0, 448] \n" "ld1 {v2.8b}, [%1],%5 \n" // center * 2 "ld1 {v3.8b}, [%1],%6 \n" - "prfm pldl1keep, [%1, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%1, 448] \n" "add v0.8h, v0.8h, v1.8h \n" "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%2],%5 \n" // bottom "ld1 {v3.8b}, [%2],%6 \n" - "prfm pldl1keep, [%2, 448] \n" "subs %w4, %w4, #8 \n" // 8 pixels + "prfm pldl1keep, [%2, 448] \n" "usubl v1.8h, v2.8b, v3.8b \n" "add v0.8h, v0.8h, v1.8h \n" "abs v0.8h, v0.8h \n" @@ -3124,11 +3123,11 @@ void SobelYRow_NEON(const uint8_t* src_y0, "add v0.8h, v0.8h, v1.8h \n" "ld1 {v2.8b}, [%0],%5 \n" // right "ld1 {v3.8b}, [%1],%5 \n" - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #8 \n" // 8 pixels "usubl v1.8h, v2.8b, v3.8b \n" + "prfm pldl1keep, [%0, 448] \n" "add v0.8h, v0.8h, v1.8h \n" + "prfm pldl1keep, [%1, 448] \n" "abs v0.8h, v0.8h \n" "uqxtn v0.8b, v0.8h \n" "st1 {v0.8b}, [%2], #8 \n" // store 8 sobely @@ -3151,9 +3150,9 @@ void HalfFloat1Row_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3175,9 +3174,9 @@ void HalfFloatRow_NEON(const uint16_t* src, asm volatile( "1: \n" "ld1 {v1.16b}, [%0], #16 \n" // load 8 shorts - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v2.4s, v1.4h \n" // 8 int's + "prfm pldl1keep, [%0, 448] \n" "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats "scvtf v3.4s, v3.4s \n" @@ -3201,9 +3200,9 @@ void ByteToFloatRow_NEON(const uint8_t* src, asm volatile( "1: \n" "ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 pixels per loop "uxtl v1.8h, v1.8b \n" // 8 shorts + "prfm pldl1keep, [%0, 448] \n" "uxtl v2.4s, v1.4h \n" // 8 ints "uxtl2 v3.4s, v1.8h \n" "scvtf v2.4s, v2.4s \n" // 8 floats @@ -3230,9 +3229,9 @@ float ScaleMaxSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" // scale "fmax v5.4s, v5.4s, v1.4s \n" // max "fmax v6.4s, v6.4s, v2.4s \n" @@ -3260,9 +3259,9 @@ float ScaleSumSamples_NEON(const float* src, "1: \n" "ld1 {v1.4s, v2.4s}, [%0], #32 \n" // load 8 samples - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #8 \n" // 8 processed per loop "fmul v3.4s, v1.4s, %4.s[0] \n" // scale + "prfm pldl1keep, [%0, 448] \n" "fmul v4.4s, v2.4s, %4.s[0] \n" "fmla v5.4s, v1.4s, v1.4s \n" // sum of squares "fmla v6.4s, v2.4s, v2.4s \n" @@ -3470,10 +3469,10 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y, "1: \n" "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values - "prfm pldl1keep, [%0, 448] \n" - "prfm pldl1keep, [%1, 448] \n" "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V 
values + "prfm pldl1keep, [%0, 448] \n" "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "prfm pldl1keep, [%1, 448] \n" "subs %w3, %w3, #16 \n" // 16 pixels per loop "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels "b.gt 1b \n" @@ -3494,12 +3493,12 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v3.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v2.8b, v1.8h, #2 \n" @@ -3523,12 +3522,12 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv, "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv - "prfm pldl1keep, [%0, 448] \n" "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%0, 448] \n" "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 - "prfm pldl1keep, [%1, 448] \n" "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "prfm pldl1keep, [%1, 448] \n" "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average "uqrshrn v1.8b, v1.8h, #2 \n" @@ -3548,8 +3547,8 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { asm volatile( "1: \n" "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop + "prfm pldl1keep, [%0, 448] \n" "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels "b.gt 1b \n" : "+r"(src_ayuv), // %0 @@ -3570,9 +3569,9 @@ void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) { "1: \n" "ld1 {v0.16b}, [%0], 16 \n" // load 16 UV values "ld1 {v1.16b}, [%0], 16 \n" - "prfm pldl1keep, [%0, 448] \n" "subs %w2, %w2, #16 \n" // 16 pixels per loop "tbl v0.16b, {v0.16b}, v2.16b \n" + "prfm pldl1keep, [%0, 448] \n" "tbl v1.16b, {v1.16b}, v2.16b \n" "stp q0, q1, [%1], 32 \n" // store 16 VU pixels "b.gt 1b \n" @@ -3625,34 +3624,24 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv, uint16_t* dst_v, int depth, int width) { + int shift = depth - 16; // Negative for right shift. 
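// Illustrative note, not part of the patch: as in the 32-bit path above,
// AArch64 ushl shifts right when the per-lane count is negative, so one
// ushl by (depth - 16) replaces the previous ushll/ushl/xtn
// widen-shift-narrow sequence in the asm below.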
asm volatile( - "dup v0.4s, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" - "ld2 {v1.8h, v2.8h}, [%0], #32 \n" // load 8 UV + "ld2 {v0.8h, v1.8h}, [%0], #32 \n" // load 8 UV + "subs %w3, %w3, #8 \n" // 8 src pixels per loop + "ushl v0.8h, v0.8h, v2.8h \n" "prfm pldl1keep, [%0, 448] \n" - "ushll v3.4s, v1.4h, #0 \n" - "ushll2 v4.4s, v1.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v1.4h, v3.4s \n" - "xtn2 v1.8h, v4.4s \n" - "ushll v3.4s, v2.4h, #0 \n" - "ushll2 v4.4s, v2.8h, #0 \n" - "ushl v3.4s, v3.4s, v0.4s \n" - "ushl v4.4s, v4.4s, v0.4s \n" - "xtn v2.4h, v3.4s \n" - "xtn2 v2.8h, v4.4s \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop - "st1 {v1.8h}, [%1], #16 \n" // store 8 U pixels - "st1 {v2.8h}, [%2], #16 \n" // store 8 V pixels + "ushl v1.8h, v1.8h, v2.8h \n" + "st1 {v0.8h}, [%1], #16 \n" // store 8 U pixels + "st1 {v1.8h}, [%2], #16 \n" // store 8 V pixels "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(depth), // %3 - "+r"(width) // %4 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); + "+r"(width) // %3 + : "r"(shift) // %4 + : "cc", "memory", "v0", "v1", "v2"); } void MergeUVRow_16_NEON(const uint16_t* src_u, @@ -3662,23 +3651,22 @@ void MergeUVRow_16_NEON(const uint16_t* src_u, int width) { int shift = 16 - depth; asm volatile( - "dup v2.8h, %w3 \n" + "dup v2.8h, %w4 \n" "1: \n" "ld1 {v0.8h}, [%0], #16 \n" // load 8 U - "prfm pldl1keep, [%0, 448] \n" + "subs %w3, %w3, #8 \n" // 8 src pixels per loop "ld1 {v1.8h}, [%1], #16 \n" // load 8 V - "prfm pldl1keep, [%1, 448] \n" "ushl v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "ushl v1.8h, v1.8h, v2.8h \n" - "subs %w4, %w4, #8 \n" // 8 src pixels per loop + "prfm pldl1keep, [%1, 448] \n" "st2 {v0.8h, v1.8h}, [%2], #32 \n" // store 8 UV pixels "b.gt 1b \n" : "+r"(src_u), // %0 "+r"(src_v), // %1 "+r"(dst_uv), // %2 - "+r"(shift), // %3 - "+r"(width) // %4 - : + "+r"(width) // %3 + : "r"(shift) // %4 : "cc", "memory", "v0", "v1", "v2"); } @@ -3690,8 +3678,8 @@ void MultiplyRow_16_NEON(const uint16_t* src_y, "dup v2.8h, %w2 \n" "1: \n" "ldp q0, q1, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" "mul v0.8h, v0.8h, v2.8h \n" + "prfm pldl1keep, [%0, 448] \n" "mul v1.8h, v1.8h, v2.8h \n" "stp q0, q1, [%1] \n" // store 16 pixels "add %1, %1, #32 \n" @@ -3713,9 +3701,9 @@ void DivideRow_16_NEON(const uint16_t* src_y, "dup v0.8h, %w2 \n" "1: \n" "ldp q1, q2, [%0], #32 \n" - "prfm pldl1keep, [%0, 448] \n" "ushll v3.4s, v1.4h, #0 \n" "ushll v4.4s, v2.4h, #0 \n" + "prfm pldl1keep, [%0, 448] \n" "ushll2 v1.4s, v1.8h, #0 \n" "ushll2 v2.4s, v2.8h, #0 \n" "mul v3.4s, v0.4s, v3.4s \n" diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index fd1755cdc..75f1e5d5f 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -2605,6 +2605,64 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// 16 bit channel split and merge +TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) { + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels, kPixels * 2 * 2); + align_buffer_page_end(tmp_pixels_u_c, kPixels * 2); + align_buffer_page_end(tmp_pixels_v_c, kPixels * 2); + align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2); + align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2); + MemRandomize(src_pixels, kPixels * 2 * 2); + 
MemRandomize(tmp_pixels_u_c, kPixels * 2); + MemRandomize(tmp_pixels_v_c, kPixels * 2); + MemRandomize(tmp_pixels_u_opt, kPixels * 2); + MemRandomize(tmp_pixels_v_opt, kPixels * 2); + MemRandomize(dst_pixels_opt, kPixels * 2 * 2); + MemRandomize(dst_pixels_c, kPixels * 2 * 2); + + MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)tmp_pixels_u_c, benchmark_width_, + (uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_, + benchmark_height_, 12); + MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_, + (const uint16_t*)tmp_pixels_v_c, benchmark_width_, + (uint16_t*)dst_pixels_c, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + MaskCpuFlags(benchmark_cpu_info_); + + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)tmp_pixels_u_opt, benchmark_width_, + (uint16_t*)tmp_pixels_v_opt, benchmark_width_, + benchmark_width_, benchmark_height_, 12); + + for (int i = 0; i < benchmark_iterations_; ++i) { + MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_, + (const uint16_t*)tmp_pixels_v_opt, benchmark_width_, + (uint16_t*)dst_pixels_opt, benchmark_width_ * 2, + benchmark_width_, benchmark_height_, 12); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]); + EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]); + } + for (int i = 0; i < kPixels * 2 * 2; ++i) { + EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]); + } + free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(tmp_pixels_u_c); + free_aligned_buffer_page_end(tmp_pixels_v_c); + free_aligned_buffer_page_end(tmp_pixels_u_opt); + free_aligned_buffer_page_end(tmp_pixels_v_opt); + free_aligned_buffer_page_end(dst_pixels_opt); + free_aligned_buffer_page_end(dst_pixels_c); +} + TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; @@ -2649,6 +2707,46 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) { free_aligned_buffer_page_end(dst_pixels_c); } +// 16 bit channel split +TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) { + // Round count up to multiple of 16 + const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15; + align_buffer_page_end(src_pixels, kPixels * 2 * 2); + align_buffer_page_end(dst_pixels_u_c, kPixels * 2); + align_buffer_page_end(dst_pixels_v_c, kPixels * 2); + align_buffer_page_end(dst_pixels_u_opt, kPixels * 2); + align_buffer_page_end(dst_pixels_v_opt, kPixels * 2); + MemRandomize(src_pixels, kPixels * 2 * 2); + MemRandomize(dst_pixels_u_c, kPixels * 2); + MemRandomize(dst_pixels_v_c, kPixels * 2); + MemRandomize(dst_pixels_u_opt, kPixels * 2); + MemRandomize(dst_pixels_v_opt, kPixels * 2); + + MaskCpuFlags(disable_cpu_flags_); + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_c, benchmark_width_, + (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_, + benchmark_height_, 10); + MaskCpuFlags(benchmark_cpu_info_); + + for (int i = 0; i < benchmark_iterations_; ++i) { + SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2, + (uint16_t*)dst_pixels_u_opt, benchmark_width_, + (uint16_t*)dst_pixels_v_opt, benchmark_width_, + benchmark_width_, benchmark_height_, 10); + } + + for (int i = 0; i < kPixels * 2; ++i) { + EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]); + EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]); + } + 
free_aligned_buffer_page_end(src_pixels); + free_aligned_buffer_page_end(dst_pixels_u_c); + free_aligned_buffer_page_end(dst_pixels_v_c); + free_aligned_buffer_page_end(dst_pixels_u_opt); + free_aligned_buffer_page_end(dst_pixels_v_opt); +} + TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) { // Round count up to multiple of 16 const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
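For reference, here is a minimal caller-side sketch of the fixed planar API. It is illustrative only, not part of the patch; the dimensions are arbitrary and, as in the unit tests above, strides are counted in uint16_t elements.

    #include <stdint.h>
    #include "libyuv/planar_functions.h"

    enum { kW = 64, kH = 32 };
    static uint16_t src_uv[2 * kW * kH];  // interleaved msb-aligned (P010-style) UV
    static uint16_t u[kW * kH], v[kW * kH];
    static uint16_t dst_uv[2 * kW * kH];

    // Split into lsb-aligned (I010-style) U and V planes, then merge back.
    // Round-tripping 10-bit data is lossless; on arbitrary 16-bit input the
    // low 6 bits of each sample come back as zero.
    static void RoundTrip10Bit(void) {
      SplitUVPlane_16(src_uv, kW * 2, u, kW, v, kW, kW, kH, /*depth=*/10);
      MergeUVPlane_16(u, kW, v, kW, dst_uv, kW * 2, kW, kH, /*depth=*/10);
    }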