mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-04-30 19:09:18 +08:00
Add Gemini implementation for NEON32 RGB to YUV matrix operations
These are about 25% faster than the C versions.

Bug: libyuv:42280902
Change-Id: I8b298670ee5f3ed5db35527fc41d6d9a51b020a1
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7573682
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Commit-Queue: Dale Curtis <dalecurtis@chromium.org>
This commit is contained in:
parent
4183733af5
commit
1170363ce5
@ -406,11 +406,17 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB24ROW_NEON
|
||||
#define HAS_ARGBTORGB565DITHERROW_NEON
|
||||
#define HAS_ARGBTORGB565ROW_NEON
|
||||
#if !defined(__aarch64__)
|
||||
#define HAS_ARGBTOUV444MATRIXROW_NEON
|
||||
#endif
|
||||
#define HAS_ARGBTOUV444ROW_NEON
|
||||
#define HAS_ARGBTOUVJ444ROW_NEON
|
||||
#define HAS_ARGBTOUVJROW_NEON
|
||||
#define HAS_ARGBTOUVROW_NEON
|
||||
#define HAS_ARGBTOYJROW_NEON
|
||||
#if !defined(__aarch64__)
|
||||
#define HAS_ARGBTOYMATRIXROW_NEON
|
||||
#endif
|
||||
#define HAS_ARGBTOYROW_NEON
|
||||
#define HAS_AYUVTOUVROW_NEON
|
||||
#define HAS_AYUVTOVUROW_NEON
|
||||
@ -975,20 +981,19 @@ typedef uint32_t ulvec32[8];
|
||||
typedef uint8_t ulvec8[32];
|
||||
#endif
|
||||
|
||||
struct ArgbConstants {
|
||||
uint8_t kRGBToY[32];
|
||||
int8_t kRGBToU[32];
|
||||
int8_t kRGBToV[32];
|
||||
uint16_t kAddY[16];
|
||||
uint16_t kAddUV[16];
|
||||
};
|
||||
|
||||
#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
|
||||
// This struct is for ARM and RISC-V color conversion.
|
||||
struct YuvConstants {
|
||||
uvec8 kUVCoeff;
|
||||
vec16 kRGBCoeffBias;
|
||||
};
|
||||
struct ArgbConstants {
|
||||
uvec8 kRGBToY;
|
||||
vec8 kRGBToU;
|
||||
vec8 kRGBToV;
|
||||
uvec16 kAddY;
|
||||
uvec16 kAddUV;
|
||||
};
|
||||
#else
|
||||
// This struct is for Intel color conversion.
|
||||
struct YuvConstants {
|
||||
@ -998,6 +1003,13 @@ struct YuvConstants {
|
||||
int16_t kYToRgb[16];
|
||||
int16_t kYBiasToRgb[16];
|
||||
};
|
||||
struct ArgbConstants {
|
||||
uint8_t kRGBToY[32];
|
||||
int8_t kRGBToU[32];
|
||||
int8_t kRGBToV[32];
|
||||
uint16_t kAddY[16];
|
||||
uint16_t kAddUV[16];
|
||||
};
|
||||
|
||||
// Offsets into YuvConstants structure
|
||||
#define KUVTOB 0
|
||||
@ -1778,6 +1790,27 @@ void RGBAToYJRow_LSX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
|
||||
void ARGBToYJRow_LASX(const uint8_t* src_argb0, uint8_t* dst_y, int width);
|
||||
void ABGRToYJRow_LASX(const uint8_t* src_abgr, uint8_t* dst_yj, int width);
|
||||
void RGBAToYJRow_LASX(const uint8_t* src_rgba, uint8_t* dst_yj, int width);
|
||||
|
||||
#if !defined(__aarch64__)
|
||||
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToUV444MatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
void ARGBToYMatrixRow_Any_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c);
|
||||
#endif
|
||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
|
||||
@ -2178,6 +2178,14 @@ int ARGBToI420Matrix(const uint8_t* src_argb,
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
|
||||
height == 0) {
|
||||
|
||||
@ -199,6 +199,22 @@ int ARGBToI444Matrix(const uint8_t* src_argb,
|
||||
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOUV444MATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToUV444MatrixRow = ARGBToUV444MatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
|
||||
height == 0) {
|
||||
@ -415,6 +431,14 @@ int ARGBToI422Matrix(const uint8_t* src_argb,
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
if (!src_argb || !dst_y || !dst_u || !dst_v || !argbconstants || width <= 0 ||
|
||||
height == 0) {
|
||||
@ -677,6 +701,14 @@ int ARGBToNV12Matrix(const uint8_t* src_argb,
|
||||
ARGBToUVMatrixRow = ARGBToUVMatrixRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
void (*MergeUVRow)(const uint8_t* src_u, const uint8_t* src_v,
|
||||
uint8_t* dst_uv, int width) = MergeUVRow_C;
|
||||
|
||||
@ -2253,6 +2253,30 @@ ANY12M(ARGBToUV444MatrixRow_Any_AVX2, ARGBToUV444MatrixRow_AVX2, 4, 31)
|
||||
#ifdef HAS_ARGBTOUV444MATRIXROW_SSSE3
|
||||
ANY12M(ARGBToUV444MatrixRow_Any_SSSE3, ARGBToUV444MatrixRow_SSSE3, 4, 15)
|
||||
#endif
|
||||
#ifdef HAS_ARGBTOUV444MATRIXROW_NEON
|
||||
ANY12M(ARGBToUV444MatrixRow_Any_NEON, ARGBToUV444MatrixRow_NEON, 4, 7)
|
||||
#endif
|
||||
|
||||
#define ANY11MC(NAMEANY, ANY_SIMD, BPP, MASK) \
|
||||
void NAMEANY(const uint8_t* src_ptr, uint8_t* dst_ptr, int width, \
|
||||
const struct ArgbConstants* c) { \
|
||||
SIMD_ALIGNED(uint8_t vin[128]); \
|
||||
SIMD_ALIGNED(uint8_t vout[128]); \
|
||||
memset(vin, 0, sizeof(vin)); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, n, c); \
|
||||
} \
|
||||
memcpy(vin, src_ptr + (ptrdiff_t)n * BPP, (ptrdiff_t)r * BPP); \
|
||||
ANY_SIMD(vin, vout, MASK + 1, c); \
|
||||
memcpy(dst_ptr + (ptrdiff_t)n, vout, (ptrdiff_t)r); \
|
||||
}
|
||||
|
||||
#ifdef HAS_ARGBTOYMATRIXROW_NEON
|
||||
ANY11MC(ARGBToYMatrixRow_Any_NEON, ARGBToYMatrixRow_NEON, 4, 15)
|
||||
#endif
|
||||
#undef ANY11MC
|
||||
|
||||
#ifdef HAS_ARGBTOUVROW_AVX2
|
||||
ANY12S(ARGBToUVRow_Any_AVX2, ARGBToUVRow_AVX2, 0, 4, 31)
|
||||
|
||||
@ -1486,6 +1486,15 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}
|
||||
#endif
|
||||
|
||||
#if defined(__aarch64__) || defined(__arm__) || defined(__riscv)
|
||||
#define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \
|
||||
AUV) \
|
||||
{{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3}, \
|
||||
{U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3, U0, U1, U2, U3}, \
|
||||
{V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3, V0, V1, V2, V3}, \
|
||||
{AY, AY, AY, AY, AY, AY, AY, AY}, \
|
||||
{AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV}}
|
||||
#else
|
||||
#define ARGBCONSTANTSBODY(Y0, Y1, Y2, Y3, U0, U1, U2, U3, V0, V1, V2, V3, AY, \
|
||||
AUV) \
|
||||
{{Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, Y0, Y1, Y2, Y3, \
|
||||
@ -1497,6 +1506,7 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
{AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY, AY}, \
|
||||
{AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, AUV, \
|
||||
AUV, AUV}}
|
||||
#endif
|
||||
|
||||
// clang-format on
|
||||
|
||||
|
||||
@ -9,6 +9,7 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/convert_from_argb.h" // For ArgbConstants
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
@ -1840,39 +1841,36 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
||||
);
|
||||
}
|
||||
|
||||
// Coefficients expressed as negatives to allow 128
|
||||
struct RgbUVConstants {
|
||||
int8_t kRGBToU[4];
|
||||
int8_t kRGBToV[4];
|
||||
};
|
||||
|
||||
// 8x1 pixels.
|
||||
static void ARGBToUV444MatrixRow_NEON(
|
||||
const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%4] \n" // load rgbuvconstants
|
||||
"vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
|
||||
"vdup.u8 d25, d0[1] \n" // UG -0.5781 coefficient
|
||||
"vdup.u8 d26, d0[2] \n" // UR -0.2969 coefficient
|
||||
"vdup.u8 d27, d0[4] \n" // VB -0.1406 coefficient
|
||||
"vdup.u8 d28, d0[5] \n" // VG -0.7344 coefficient
|
||||
"vneg.s8 d24, d24 \n"
|
||||
"vmov.u16 q15, #0x8000 \n" // 128.0
|
||||
"vld1.8 {d16}, [%4] \n" // load kRGBToU
|
||||
"vld1.8 {d17}, [%5] \n" // load kRGBToV
|
||||
"vld1.16 {d18[0]}, [%6] \n" // load kAddUV[0]
|
||||
"vabs.s8 d16, d16 \n" // BU, GU, RU
|
||||
"vabs.s8 d17, d17 \n" // BV, GV, RV
|
||||
"vdup.8 d20, d16[0] \n" // BU
|
||||
"vdup.8 d21, d16[1] \n" // GU
|
||||
"vdup.8 d22, d16[2] \n" // RU
|
||||
"vdup.8 d23, d17[0] \n" // BV
|
||||
"vdup.8 d24, d17[1] \n" // GV
|
||||
"vdup.8 d25, d17[2] \n" // RV
|
||||
"vdup.16 q15, d18[0] \n" // kAddUV
|
||||
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d24 \n" // B
|
||||
"vmlsl.u8 q2, d1, d25 \n" // G
|
||||
"vmlsl.u8 q2, d2, d26 \n" // R
|
||||
"vmull.u8 q2, d0, d20 \n" // B * BU
|
||||
"vmlsl.u8 q2, d1, d21 \n" // - G * GU
|
||||
"vmlsl.u8 q2, d2, d22 \n" // - R * RU
|
||||
|
||||
"vmull.u8 q3, d2, d24 \n" // R
|
||||
"vmlsl.u8 q3, d1, d28 \n" // G
|
||||
"vmlsl.u8 q3, d0, d27 \n" // B
|
||||
"vmull.u8 q3, d2, d25 \n" // R * RV
|
||||
"vmlsl.u8 q3, d1, d24 \n" // - G * GV
|
||||
"vmlsl.u8 q3, d0, d23 \n" // - B * BV
|
||||
|
||||
"vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned
|
||||
"vaddhn.u16 d1, q3, q15 \n"
|
||||
@ -1880,53 +1878,32 @@ static void ARGBToUV444MatrixRow_NEON(
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(rgbuvconstants) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
|
||||
"q15");
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(&c->kRGBToU), // %4
|
||||
"r"(&c->kRGBToV), // %5
|
||||
"r"(&c->kAddUV) // %6
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
}
|
||||
|
||||
// RGB to BT601 coefficients
|
||||
// UB 0.875 coefficient = 112
|
||||
// UG -0.5781 coefficient = -74
|
||||
// UR -0.2969 coefficient = -38
|
||||
// VB -0.1406 coefficient = -18
|
||||
// VG -0.7344 coefficient = -94
|
||||
// VR 0.875 coefficient = 112
|
||||
|
||||
static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
|
||||
{18, 94, -112, 0}};
|
||||
|
||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||
&kARGBI601UVConstants);
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
// RGB to JPEG coefficients
|
||||
// UB 0.500 coefficient = 128
|
||||
// UG -0.33126 coefficient = -85
|
||||
// UR -0.16874 coefficient = -43
|
||||
// VB -0.08131 coefficient = -21
|
||||
// VG -0.41869 coefficient = -107
|
||||
// VR 0.500 coefficient = 128
|
||||
|
||||
static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
|
||||
{21, 107, -128, 0}};
|
||||
|
||||
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||
&kARGBJPEGUVConstants);
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
|
||||
// clang-format off
|
||||
// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
|
||||
#define RGBTOUV(QB, QG, QR) \
|
||||
@ -2754,47 +2731,22 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
||||
}
|
||||
|
||||
struct RgbConstants {
|
||||
uint8_t kRGBToY[4];
|
||||
uint16_t kAddY;
|
||||
};
|
||||
|
||||
// RGB to JPeg coefficients
|
||||
// B * 0.1140 coefficient = 29
|
||||
// G * 0.5870 coefficient = 150
|
||||
// R * 0.2990 coefficient = 77
|
||||
// Add 0.5
|
||||
static const struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0},
|
||||
0x0080};
|
||||
|
||||
static const struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 0x0080};
|
||||
|
||||
// RGB to BT.601 coefficients
|
||||
// B * 0.1016 coefficient = 25
|
||||
// G * 0.5078 coefficient = 129
|
||||
// R * 0.2578 coefficient = 66
|
||||
// Add 16.5 = 0x1080
|
||||
|
||||
static const struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0},
|
||||
0x1080};
|
||||
|
||||
static const struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
|
||||
|
||||
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
||||
static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
"vdup.u8 d22, d0[2] \n"
|
||||
"vdup.u16 q12, d0[2] \n"
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"subs %1, %1, #16 \n" // 16 processed per loop.
|
||||
"vmull.u8 q8, d0, d20 \n" // B
|
||||
"vmull.u8 q9, d1, d20 \n"
|
||||
"vmlal.u8 q8, d2, d21 \n" // G
|
||||
@ -2803,30 +2755,31 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"vmlal.u8 q9, d5, d22 \n"
|
||||
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||
"vaddhn.u16 d1, q9, q12 \n"
|
||||
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
|
||||
"vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(rgbconstants) // %3
|
||||
"+r"(width), // %1
|
||||
"+r"(dst_y) // %2
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
}
|
||||
|
||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
|
||||
@ -2834,13 +2787,14 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
|
||||
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
"vdup.u8 d22, d0[2] \n"
|
||||
"vdup.u16 q12, d0[2] \n"
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||
@ -2858,33 +2812,35 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(rgbconstants) // %3
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
}
|
||||
|
||||
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
|
||||
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct RgbConstants* rgbconstants) {
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||
"vdup.u8 d20, d0[0] \n"
|
||||
"vdup.u8 d21, d0[1] \n"
|
||||
"vdup.u8 d22, d0[2] \n"
|
||||
"vdup.u16 q12, d0[2] \n"
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"1: \n"
|
||||
"vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
|
||||
// RGB24.
|
||||
@ -2903,25 +2859,26 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(rgbconstants) // %3
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
}
|
||||
|
||||
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
|
||||
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
|
||||
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
// Bilinear filter 16x2 -> 16x1
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user