From 1170363ce55fec2a256ce383479d8a6a3edadffe Mon Sep 17 00:00:00 2001 From: Dale Curtis Date: Thu, 19 Mar 2026 23:39:57 +0000 Subject: [PATCH] Add Gemini implementation for NEON32 RGB to YUV matrix operations These are about 25% faster than the C versions. Bug: libyuv:42280902 Change-Id: I8b298670ee5f3ed5db35527fc41d6d9a51b020a1 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7573682 Reviewed-by: Frank Barchard Commit-Queue: Dale Curtis --- include/libyuv/row.h | 49 +++++++-- source/convert.cc | 8 ++ source/convert_from_argb.cc | 32 ++++++ source/row_any.cc | 24 +++++ source/row_common.cc | 10 ++ source/row_neon.cc | 205 ++++++++++++++---------------------- 6 files changed, 196 insertions(+), 132 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 034ff866e..db875b74f 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -406,11 +406,17 @@ extern "C" { #define HAS_ARGBTORGB24ROW_NEON #define HAS_ARGBTORGB565DITHERROW_NEON #define HAS_ARGBTORGB565ROW_NEON +#if !defined(__aarch64__) +#define HAS_ARGBTOUV444MATRIXROW_NEON +#endif #define HAS_ARGBTOUV444ROW_NEON #define HAS_ARGBTOUVJ444ROW_NEON #define HAS_ARGBTOUVJROW_NEON #define HAS_ARGBTOUVROW_NEON #define HAS_ARGBTOYJROW_NEON +#if !defined(__aarch64__) +#define HAS_ARGBTOYMATRIXROW_NEON +#endif #define HAS_ARGBTOYROW_NEON #define HAS_AYUVTOUVROW_NEON #define HAS_AYUVTOVUROW_NEON @@ -975,20 +981,19 @@ typedef uint32_t ulvec32[8]; typedef uint8_t ulvec8[32]; #endif -struct ArgbConstants { - uint8_t kRGBToY[32]; - int8_t kRGBToU[32]; - int8_t kRGBToV[32]; - uint16_t kAddY[16]; - uint16_t kAddUV[16]; -}; - #if defined(__aarch64__) || defined(__arm__) || defined(__riscv) // This struct is for ARM and RISC-V color conversion. struct YuvConstants { uvec8 kUVCoeff; vec16 kRGBCoeffBias; }; +struct ArgbConstants { + uvec8 kRGBToY; + vec8 kRGBToU; + vec8 kRGBToV; + uvec16 kAddY; + uvec16 kAddUV; +}; #else // This struct is for Intel color conversion. struct YuvConstants { @@ -998,6 +1003,13 @@ struct YuvConstants { int16_t kYToRgb[16]; int16_t kYBiasToRgb[16]; }; +struct ArgbConstants { + uint8_t kRGBToY[32]; + int8_t kRGBToU[32]; + int8_t kRGBToV[32]; + uint16_t kAddY[16]; + uint16_t kAddUV[16]; +}; // Offsets into YuvConstants structure #define KUVTOB 0 @@ -1778,6 +1790,27 @@ void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width); void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width); void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width); + +#if !defined(__aarch64__) +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c); +void ARGBToYMatrixRow_Any_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c); +#endif void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, diff --git a/source/convert.cc b/source/convert.cc index e01442316..cddaf961b 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -2178,6 +2178,14 @@ int ARGBToI420Matrix(const uint8_t* src_argb, ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 8f6483a02..c7bf41ea8 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -199,6 +199,22 @@ int ARGBToI444Matrix(const uint8_t* src_argb, ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } +#endif +#if defined(HAS_ARGBTOUV444MATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON; + if (IS_ALIGNED(width, 8)) { + ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_NEON; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -415,6 +431,14 @@ int ARGBToI422Matrix(const uint8_t* src_argb, ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 || height == 0) { @@ -677,6 +701,14 @@ int ARGBToNV12Matrix(const uint8_t* src_argb, ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2; } } +#endif +#if defined(HAS_ARGBTOYMATRIXROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; + } + } #endif void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v, uint8_t* dst_uv, int width) = MergeUVRow_C; diff --git a/source/row_any.cc b/source/row_any.cc index f44bcfb5c..f34f3eb2e 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -2253,6 +2253,30 @@ ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31) #ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3 ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15) #endif +#ifdef HAS_ARGBTOUV444MATRIXROW_NEON +ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7) +#endif + +#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK) \ + void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \ + const struct ArgbConstants* c) { \ + SIMD_ALIGNED(uint8_t vin[128]); \ + SIMD_ALIGNED(uint8_t vout[128]); \ + memset(vin, 0, sizeof(vin)); /* for msan */ \ + int r = width & MASK; \ + int n = width & ~MASK; \ + if (n > 0) { \ + ANY_SIMD(src_ptr, dst_ptr, n, c); \ + } \ + memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \ + ANY_SIMD(vin, vout, MASK + 1, c); \ + memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \ + } + +#ifdef HAS_ARGBTOYMATRIXROW_NEON +ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15) +#endif +#undef ANY11MC #ifdef HAS_ARGBTOUVROW_AVX2 ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31) diff --git a/source/row_common.cc b/source/row_common.cc index a9969d808..8b192a539 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1486,6 +1486,15 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { {YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}} #endif +#if defined(__aarch64__) || defined(__arm__) || defined(__riscv) +#define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \ + AUV) \ + {{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3}, \ + {U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3}, \ + {V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3}, \ + {AY, AY, AY, AY, AY, AY, AY, AY}, \ + {AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV}} +#else #define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \ AUV) \ {{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, \ @@ -1497,6 +1506,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) { {AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY}, \ {AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, \ AUV, AUV}} +#endif // clang-format on diff --git a/source/row_neon.cc b/source/row_neon.cc index 1f1a3bbf3..689412668 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -9,6 +9,7 @@ */ #include "libyuv/row.h" +#include "libyuv/convert_from_argb.h" // For ArgbConstants #ifdef __cplusplus namespace libyuv { @@ -1840,39 +1841,36 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb, ); } -// Coefficients expressed as negatives to allow 128 -struct RgbUVConstants { - int8_t kRGBToU[4]; - int8_t kRGBToV[4]; -}; - // 8x1 pixels. -static void ARGBToUV444MatrixRow_NEON( - const uint8_t* src_argb, - uint8_t* dst_u, - uint8_t* dst_v, - int width, - const struct RgbUVConstants* rgbuvconstants) { +void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_u, + uint8_t* dst_v, + int width, + const struct ArgbConstants* c) { asm volatile( - "vld1.8 {d0}, [%4] \n" // load rgbuvconstants - "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient - "vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient - "vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient - "vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient - "vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient - "vneg.s8 d24, d24 \n" - "vmov.u16 q15, #0x8000 \n" // 128.0 + "vld1.8 {d16}, [%4] \n" // load kRGBToU + "vld1.8 {d17}, [%5] \n" // load kRGBToV + "vld1.16 {d18[0]}, [%6] \n" // load kAddUV[0] + "vabs.s8 d16, d16 \n" // BU, GU, RU + "vabs.s8 d17, d17 \n" // BV, GV, RV + "vdup.8 d20, d16[0] \n" // BU + "vdup.8 d21, d16[1] \n" // GU + "vdup.8 d22, d16[2] \n" // RU + "vdup.8 d23, d17[0] \n" // BV + "vdup.8 d24, d17[1] \n" // GV + "vdup.8 d25, d17[2] \n" // RV + "vdup.16 q15, d18[0] \n" // kAddUV "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "subs %3, %3, #8 \n" // 8 processed per loop. - "vmull.u8 q2, d0, d24 \n" // B - "vmlsl.u8 q2, d1, d25 \n" // G - "vmlsl.u8 q2, d2, d26 \n" // R + "vmull.u8 q2, d0, d20 \n" // B * BU + "vmlsl.u8 q2, d1, d21 \n" // - G * GU + "vmlsl.u8 q2, d2, d22 \n" // - R * RU - "vmull.u8 q3, d2, d24 \n" // R - "vmlsl.u8 q3, d1, d28 \n" // G - "vmlsl.u8 q3, d0, d27 \n" // B + "vmull.u8 q3, d2, d25 \n" // R * RV + "vmlsl.u8 q3, d1, d24 \n" // - G * GV + "vmlsl.u8 q3, d0, d23 \n" // - B * BV "vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned "vaddhn.u16 d1, q3, q15 \n" @@ -1880,53 +1878,32 @@ static void ARGBToUV444MatrixRow_NEON( "vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "bgt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_u), // %1 - "+r"(dst_v), // %2 - "+r"(width) // %3 - : "r"(rgbuvconstants) // %4 - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(width) // %3 + : "r"(&c->kRGBToU), // %4 + "r"(&c->kRGBToV), // %5 + "r"(&c->kAddUV) // %6 + : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", + "q12", "q13", "q14", "q15"); } -// RGB to BT601 coefficients -// UB 0.875 coefficient = 112 -// UG -0.5781 coefficient = -74 -// UR -0.2969 coefficient = -38 -// VB -0.1406 coefficient = -18 -// VG -0.7344 coefficient = -94 -// VR 0.875 coefficient = 112 - -static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0}, - {18, 94, -112, 0}}; - void ARGBToUV444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kARGBI601UVConstants); + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants); } -// RGB to JPEG coefficients -// UB 0.500 coefficient = 128 -// UG -0.33126 coefficient = -85 -// UR -0.16874 coefficient = -43 -// VB -0.08131 coefficient = -21 -// VG -0.41869 coefficient = -107 -// VR 0.500 coefficient = 128 - -static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0}, - {21, 107, -128, 0}}; - void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, uint8_t* dst_u, uint8_t* dst_v, int width) { - ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, - &kARGBJPEGUVConstants); + ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants); } + // clang-format off // 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ @@ -2754,47 +2731,22 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64, : "cc", "memory", "q0", "q1", "q2", "q3", "q4"); } -struct RgbConstants { - uint8_t kRGBToY[4]; - uint16_t kAddY; -}; - -// RGB to JPeg coefficients -// B * 0.1140 coefficient = 29 -// G * 0.5870 coefficient = 150 -// R * 0.2990 coefficient = 77 -// Add 0.5 -static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, - 0x0080}; - -static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080}; - -// RGB to BT.601 coefficients -// B * 0.1016 coefficient = 25 -// G * 0.5078 coefficient = 129 -// R * 0.2578 coefficient = 66 -// Add 16.5 = 0x1080 - -static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, - 0x1080}; - -static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080}; - // ARGB expects first 3 values to contain RGB and 4th value is ignored. -static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, - uint8_t* dst_y, - int width, - const struct RgbConstants* rgbconstants) { +void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, + uint8_t* dst_y, + int width, + const struct ArgbConstants* c) { asm volatile( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" + "vld1.8 {d16}, [%3] \n" // load kRGBToY + "vld1.16 {d18[0]}, [%4] \n" // load kAddY[0] + "vdup.8 d20, d16[0] \n" // BY + "vdup.8 d21, d16[1] \n" // GY + "vdup.8 d22, d16[2] \n" // RY + "vdup.16 q12, d18[0] \n" // AY "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB "vld4.8 {d1, d3, d5, d7}, [%0]! \n" - "subs %2, %2, #16 \n" // 16 processed per loop. + "subs %1, %1, #16 \n" // 16 processed per loop. "vmull.u8 q8, d0, d20 \n" // B "vmull.u8 q9, d1, d20 \n" "vmlal.u8 q8, d2, d21 \n" // G @@ -2803,30 +2755,31 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb, "vmlal.u8 q9, d5, d22 \n" "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y "vaddhn.u16 d1, q9, q12 \n" - "vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y. + "vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y. "bgt 1b \n" : "+r"(src_argb), // %0 - "+r"(dst_y), // %1 - "+r"(width) // %2 - : "r"(rgbconstants) // %3 + "+r"(width), // %1 + "+r"(dst_y) // %2 + : "r"(&c->kRGBToY), // %3 + "r"(&c->kAddY) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", "q12"); } void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); + ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants); } void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); + ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants); } void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); + ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants); } void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { - ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); + ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants); } // RGBA expects first value to be A and ignored, then 3 values to contain RGB. @@ -2834,13 +2787,14 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width, - const struct RgbConstants* rgbconstants) { + const struct ArgbConstants* c) { asm volatile( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" + "vld1.8 {d16}, [%3] \n" // load kRGBToY + "vld1.16 {d18[0]}, [%4] \n" // load kAddY[0] + "vdup.8 d20, d16[0] \n" // BY + "vdup.8 d21, d16[1] \n" // GY + "vdup.8 d22, d16[2] \n" // RY + "vdup.16 q12, d18[0] \n" // AY "1: \n" "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA "vld4.8 {d1, d3, d5, d7}, [%0]! \n" @@ -2858,33 +2812,35 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba, : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(rgbconstants) // %3 + : "r"(&c->kRGBToY), // %3 + "r"(&c->kAddY) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", "q12"); } void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); + RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants); } void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { - RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); + RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants); } void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { - RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); + RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants); } static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, uint8_t* dst_y, int width, - const struct RgbConstants* rgbconstants) { + const struct ArgbConstants* c) { asm volatile( - "vld1.8 {d0}, [%3] \n" // load rgbconstants - "vdup.u8 d20, d0[0] \n" - "vdup.u8 d21, d0[1] \n" - "vdup.u8 d22, d0[2] \n" - "vdup.u16 q12, d0[2] \n" + "vld1.8 {d16}, [%3] \n" // load kRGBToY + "vld1.16 {d18[0]}, [%4] \n" // load kAddY[0] + "vdup.8 d20, d16[0] \n" // BY + "vdup.8 d21, d16[1] \n" // GY + "vdup.8 d22, d16[2] \n" // RY + "vdup.16 q12, d18[0] \n" // AY "1: \n" "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of // RGB24. @@ -2903,25 +2859,26 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, : "+r"(src_rgb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 - : "r"(rgbconstants) // %3 + : "r"(&c->kRGBToY), // %3 + "r"(&c->kAddY) // %4 : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", "q12"); } void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants); + RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants); } void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants); + RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants); } void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants); + RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants); } void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { - RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants); + RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants); } // Bilinear filter 16x2 -> 16x1