ABGRToJ420 calls ARGBToI420Matrix

- Standardize libyuv ARGB-family (ARGB, ABGR, RGBA, BGRA) to YUV conversion by utilizing the generic MatrixRow architecture and explicit ArgbConstants.
- Consolidated ARGBToI420, ABGRToI420, BGRAToI420, and RGBAToI420 as wrappers for ARGBToI420Matrix.
- Refactored ABGRToJ420, ABGRToJ422, and ABGRToI422 to use generic matrix functions.
- Added matrix-based versions for NV21, I400, YUY2, and UYVY.
- Updated RAW and RGB24 to I420/I422/I444 dispatchers to use MatrixRow logic and explicit constants.
- Fixed parameter swap bugs in ARGBToI422, ARGBToJ422, and ABGRToJ422.
- Fixed a bug in the generic C implementation of the matrix row functions, ensuring all 4 channels are processed correctly for all ARGB-family formats.
- Moved kShuffleAARRGGBB in row_gcc.cc to the top of the libyuv namespace for visibility.
- Cleaned up redundant format-specific row implementations.

Bug: libyuv:42280902
Change-Id: I67ffa4c476abc0d2dcc4650510d7bda91b65988e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7830291
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2026-05-07 19:58:19 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent 4aacbbdfb4
commit 4b4e68b372
10 changed files with 2858 additions and 3678 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1936
Version: 1937
Revision: DEPS
License: BSD-3-Clause
License File: LICENSE

View File

@ -875,6 +875,19 @@ int BGRAToI420(const uint8_t* src_bgra,
int width,
int height);
// BGRA little endian (argb in memory) to I422.
LIBYUV_API
int BGRAToI422(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// ABGR little endian (rgba in memory) to I420.
LIBYUV_API
int ABGRToI420(const uint8_t* src_abgr,
@ -888,6 +901,19 @@ int ABGRToI420(const uint8_t* src_abgr,
int width,
int height);
// ABGR little endian (rgba in memory) to I422.
LIBYUV_API
int ABGRToI422(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGBA little endian (abgr in memory) to I420.
LIBYUV_API
int RGBAToI420(const uint8_t* src_rgba,
@ -901,6 +927,19 @@ int RGBAToI420(const uint8_t* src_rgba,
int width,
int height);
// RGBA little endian (abgr in memory) to I422.
LIBYUV_API
int RGBAToI422(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB little endian (bgr in memory) to I420.
LIBYUV_API
int RGB24ToI420(const uint8_t* src_rgb24,

View File

@ -245,6 +245,19 @@ int ARGBToI422(const uint8_t* src_argb,
int width,
int height);
// Convert ABGR To I422.
LIBYUV_API
int ABGRToI422(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB to I422 with matrix. See ArgbConstants at the top of this file for usage.
LIBYUV_API
int ARGBToI422Matrix(const uint8_t* src_argb,
@ -458,7 +471,7 @@ int ARGBToUYVY(const uint8_t* src_argb,
// RAW to NV21 with Matrix
LIBYUV_API
int RGBToNV21Matrix(const uint8_t* src_raw,
int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1936
#define LIBYUV_VERSION 1937
#endif // INCLUDE_LIBYUV_VERSION_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
#include "libyuv/convert_from_argb.h" // For ArgbConstants
#include "libyuv/planar_functions.h"
#include <assert.h>
@ -15,12 +16,10 @@
#include "libyuv/cpu_id.h"
#include "libyuv/row.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/scale_row.h" // for ScaleRowDown2
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
@ -4745,8 +4744,8 @@ static int ARGBSobelize(const uint8_t* src_argb,
uint8_t* dst,
int width)) {
int y;
void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
ARGBToYJRow_C;
void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
uint8_t* dst_sobely, int width) = SobelYRow_C;
void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
@ -4763,65 +4762,57 @@ static int ARGBSobelize(const uint8_t* src_argb,
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
#if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
ARGBToYJRow = ARGBToYJRow_AVX512BW;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
#if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
ARGBToYJRow = ARGBToYJRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
#if defined(HAS_ARGBTOYJROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
ARGBToYJRow = ARGBToYJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
ARGBToYJRow = ARGBToYJRow_LSX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
#if defined(HAS_ARGBTOYJROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
ARGBToYJRow = ARGBToYJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
ARGBToYJRow = ARGBToYJRow_LASX;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
#if defined(HAS_ARGBTOYJROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
ARGBToYJRow = ARGBToYJRow_RVV;
}
#endif
@ -4859,10 +4850,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
uint8_t* row_y2 = row_y1 + row_size;
if (!rows)
return 1;
ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants);
ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants);
ARGBToYJRow(src_argb, row_y1, width);
row_y1[-1] = row_y1[0];
memset(row_y1 + width, row_y1[width - 1], 16);
memset(row_y2 + width, 0, 16);
@ -4872,7 +4863,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
if (y < (height - 1)) {
src_argb += src_stride_argb;
}
ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants);
ARGBToYJRow(src_argb, row_y2, width);
row_y2[-1] = row_y2[0];
row_y2[width] = row_y2[width - 1];

View File

@ -753,28 +753,31 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4)
#undef MAKEROWYJ
static __inline uint8_t RGBToYMatrix(uint8_t r,
uint8_t g,
uint8_t b,
static __inline uint8_t RGBToYMatrix(uint8_t b0,
uint8_t b1,
uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) {
return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
c->kAddY[0]) >>
return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
c->kRGBToY[3] * b3 + c->kAddY[0]) >>
8;
}
static __inline uint8_t RGBToUMatrix(uint8_t r,
uint8_t g,
uint8_t b,
static __inline uint8_t RGBToUMatrix(uint8_t b0,
uint8_t b1,
uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) {
return (c->kAddUV[0] -
(c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
8;
}
static __inline uint8_t RGBToVMatrix(uint8_t r,
uint8_t g,
uint8_t b,
static __inline uint8_t RGBToVMatrix(uint8_t b0,
uint8_t b1,
uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) {
return (c->kAddUV[0] -
(c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
8;
}
@ -784,7 +787,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
const struct ArgbConstants* c) {
int x;
for (x = 0; x < width; ++x) {
dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
src_argb += 4;
dst_y += 1;
}
@ -799,25 +802,28 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1 = src_argb + src_stride_argb;
int x;
for (x = 0; x < width - 1; x += 2) {
uint8_t ab =
uint8_t b0 =
(src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
uint8_t ag =
uint8_t b1 =
(src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
uint8_t ar =
uint8_t b2 =
(src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
uint8_t b3 =
(src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
src_argb += 8;
src_argb1 += 8;
dst_u += 1;
dst_v += 1;
}
if (width & 1) {
uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
}
}
@ -828,11 +834,10 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
const struct ArgbConstants* c) {
int x;
for (x = 0; x < width; ++x) {
uint8_t ab = src_argb[0];
uint8_t ag = src_argb[1];
uint8_t ar = src_argb[2];
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
dst_u[0] =
RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
dst_v[0] =
RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
src_argb += 4;
dst_u += 1;
dst_v += 1;
@ -1513,16 +1518,16 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \
extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \
ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
-(RV), 0, AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \
extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \
ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
-(BV), 0, AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \
extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \
ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \
-(GV), -(RV), AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \
extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \
ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \
-(GV), -(BV), AY, AUV);

View File

@ -1848,32 +1848,41 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
int width,
const struct ArgbConstants* c) {
asm volatile(
"vld1.8 {d16}, [%4] \n" // load kRGBToU
"vld1.8 {d17}, [%5] \n" // load kRGBToV
"vld1.16 {d18[0]}, [%6] \n" // load kAddUV[0]
"vabs.s8 d16, d16 \n" // BU, GU, RU
"vabs.s8 d17, d17 \n" // BV, GV, RV
"vdup.8 d20, d16[0] \n" // BU
"vdup.8 d21, d16[1] \n" // GU
"vdup.8 d22, d16[2] \n" // RU
"vdup.8 d23, d17[0] \n" // BV
"vdup.8 d24, d17[1] \n" // GV
"vdup.8 d25, d17[2] \n" // RV
"vdup.16 q15, d18[0] \n" // kAddUV
"vld1.8 {d24}, [%4] \n" // load kRGBToU
"vld1.8 {d25}, [%5] \n" // load kRGBToV
"vld1.16 {d26[0]}, [%6] \n" // load kAddUV[0]
"vmovl.s8 q10, d24 \n" // U coeffs (8 shorts)
"vmovl.s8 q11, d25 \n" // V coeffs (8 shorts)
"vdup.16 q6, d26[0] \n" // bias
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B * BU
"vmlsl.u8 q2, d1, d21 \n" // - G * GU
"vmlsl.u8 q2, d2, d22 \n" // - R * RU
"vmull.u8 q3, d2, d25 \n" // R * RV
"vmlsl.u8 q3, d1, d24 \n" // - G * GV
"vmlsl.u8 q3, d0, d23 \n" // - B * BV
"vmovl.u8 q4, d0 \n" // B
"vmovl.u8 q5, d1 \n" // G
"vmovl.u8 q7, d2 \n" // R
"vmovl.u8 q8, d3 \n" // A
"vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned
"vaddhn.u16 d1, q3, q15 \n"
"vdup.16 q12, d20[0] \n"
"vmul.s16 q2, q4, q12 \n" // U = B * U0
"vdup.16 q12, d20[1] \n"
"vmla.s16 q2, q5, q12 \n" // U += G * U1
"vdup.16 q12, d20[2] \n"
"vmla.s16 q2, q7, q12 \n" // U += R * U2
"vdup.16 q12, d20[3] \n"
"vmla.s16 q2, q8, q12 \n" // U += A * U3
"vdup.16 q12, d22[0] \n"
"vmul.s16 q3, q4, q12 \n" // V = B * V0
"vdup.16 q12, d22[1] \n"
"vmla.s16 q3, q5, q12 \n" // V += G * V1
"vdup.16 q12, d22[2] \n"
"vmla.s16 q3, q7, q12 \n" // V += R * V2
"vdup.16 q12, d22[3] \n"
"vmla.s16 q3, q8, q12 \n" // V += A * V3
"vsubhn.s16 d0, q6, q2 \n" // 128.0 - U
"vsubhn.s16 d1, q6, q3 \n" // 128.0 - V
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
@ -1885,8 +1894,8 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
: "r"(&c->kRGBToU), // %4
"r"(&c->kRGBToV), // %5
"r"(&c->kAddUV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
"q12", "q13", "q14", "q15");
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q10", "q11", "q12");
}
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@ -1926,16 +1935,11 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
const struct ArgbConstants* c) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"vld1.8 {d18}, [%5] \n" // load kRGBToU
"vld1.8 {d19}, [%6] \n" // load kRGBToV
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
"vdup.16 q10, d16[0] \n" // U0
"vdup.16 q11, d16[1] \n" // U1
"vdup.16 q12, d16[2] \n" // U2
"vdup.16 q13, d18[0] \n" // V0
"vdup.16 q14, d18[1] \n" // V1
"vdup.16 q15, d18[2] \n" // V2
"vld1.8 {d24}, [%5] \n" // load kRGBToU (8 bytes, only 4 used)
"vld1.8 {d25}, [%6] \n" // load kRGBToV
"vmovl.s8 q14, d24 \n" // U coeffs in d28
"vmovl.s8 q15, d25 \n" // V coeffs in d30
"vmov.u16 q11, #0x8000 \n" // 128.0 bias
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -1944,28 +1948,39 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n"
"vpadal.u8 q0, q4 \n" // B
"vpadal.u8 q1, q5 \n" // G
"vpadal.u8 q2, q6 \n" // R
"vpadal.u8 q3, q7 \n" // A
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
"vrshr.u16 q3, q3, #2 \n"
"vmov.u16 q3, #0x8000 \n" // 128.0
"vmul.s16 q8, q0, q10 \n" // U = B * U0
"vmla.s16 q8, q1, q11 \n" // U += G * U1
"vdup.16 q12, d28[0] \n"
"vmul.s16 q8, q0, q12 \n" // U = B * U0
"vdup.16 q12, d28[1] \n"
"vmla.s16 q8, q1, q12 \n" // U += G * U1
"vdup.16 q12, d28[2] \n"
"vmla.s16 q8, q2, q12 \n" // U += R * U2
"vdup.16 q12, d28[3] \n"
"vmla.s16 q8, q3, q12 \n" // U += A * U3
"vmul.s16 q9, q0, q13 \n" // V = B * V0
"vmla.s16 q9, q1, q14 \n" // V += G * V1
"vmla.s16 q9, q2, q15 \n" // V += R * V2
"vdup.16 q12, d30[0] \n"
"vmul.s16 q9, q0, q12 \n" // V = B * V0
"vdup.16 q12, d30[1] \n"
"vmla.s16 q9, q1, q12 \n" // V += G * V1
"vdup.16 q12, d30[2] \n"
"vmla.s16 q9, q2, q12 \n" // V += R * V2
"vdup.16 q12, d30[3] \n"
"vmla.s16 q9, q3, q12 \n" // V += A * V3
"vsubhn.s16 d0, q3, q8 \n" // 128.0 - U
"vsubhn.s16 d1, q3, q9 \n" // 128.0 - V
"vsubhn.s16 d0, q11, q8 \n" // 128.0 - U
"vsubhn.s16 d1, q11, q9 \n" // 128.0 - V
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -1978,7 +1993,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
: "r"(&c->kRGBToU), // %5
"r"(&c->kRGBToV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
"q8", "q9", "q11", "q12", "q14", "q15"
);
}
@ -2212,44 +2227,8 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_bgra
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q1, q1, #2 \n" // average of 4
"vrshr.u16 q2, q2, #2 \n"
"vrshr.u16 q3, q3, #2 \n"
RGBTOUV(q3, q2, q1)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2-
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
&kBgraI601Constants);
}
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@ -2257,44 +2236,8 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_abgr
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
&kAbgrI601Constants);
}
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@ -2302,44 +2245,8 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgba
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
&kRgbaI601Constants);
}
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@ -2801,15 +2708,16 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
int width,
const struct ArgbConstants* c) {
asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY
"vdup.8 d21, d16[1] \n" // GY
"vdup.8 d22, d16[2] \n" // RY
"vdup.16 q12, d18[0] \n" // AY
"vld1.8 {d24}, [%3] \n" // load kRGBToY
"vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d24[0] \n" // B
"vdup.8 d21, d24[1] \n" // G
"vdup.8 d22, d24[2] \n" // R
"vdup.8 d23, d24[3] \n" // A
"vdup.16 q12, d25[0] \n" // bias
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 pixels
"subs %1, %1, #16 \n" // 16 processed per loop.
"vmull.u8 q8, d0, d20 \n" // B
"vmull.u8 q9, d1, d20 \n"
@ -2817,6 +2725,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
"vmlal.u8 q9, d3, d21 \n"
"vmlal.u8 q8, d4, d22 \n" // R
"vmlal.u8 q9, d5, d22 \n"
"vmlal.u8 q8, d6, d23 \n" // A
"vmlal.u8 q9, d7, d23 \n"
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
"vaddhn.u16 d1, q9, q12 \n"
"vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y.
@ -2826,8 +2736,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
"+r"(dst_y) // %2
: "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
"q12");
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
"d24", "d25");
}
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2846,52 +2756,20 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
}
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY
"vdup.8 d21, d16[1] \n" // GY
"vdup.8 d22, d16[2] \n" // RY
"vdup.16 q12, d18[0] \n" // AY
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop.
"vmull.u8 q8, d2, d20 \n" // B
"vmull.u8 q9, d3, d20 \n"
"vmlal.u8 q8, d4, d21 \n" // G
"vmlal.u8 q9, d5, d21 \n"
"vmlal.u8 q8, d6, d22 \n" // R
"vmlal.u8 q9, d7, d22 \n"
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
"vaddhn.u16 d1, q9, q12 \n"
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
"q12");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
}
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
}
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
}
void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -2899,12 +2777,12 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
int width,
const struct ArgbConstants* c) {
asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY
"vdup.8 d21, d16[1] \n" // GY
"vdup.8 d22, d16[2] \n" // RY
"vdup.16 q12, d18[0] \n" // AY
"vld1.8 {d24}, [%3] \n" // load kRGBToY
"vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d24[0] \n" // BY
"vdup.8 d21, d24[1] \n" // GY
"vdup.8 d22, d24[2] \n" // RY
"vdup.16 q12, d25[0] \n" // AY
"1: \n"
"vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
// RGB24.
@ -2925,8 +2803,8 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %2
: "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
"q12");
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
"d24", "d25");
}

View File

@ -2736,47 +2736,61 @@ struct RgbUVConstants {
};
// 8x1 pixels.
static void ARGBToUV444MatrixRow_NEON(
const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct ArgbConstants* c) {
asm volatile(
"ldr d0, [%4] \n" // load rgbuvconstants
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
"dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
"dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
"dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
"dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
"neg v24.16b, v24.16b \n"
"movi v29.8h, #0x80, lsl #8 \n" // 128.0
"ldr q16, [%[c], #16] \n" // kRGBToU
"ldr q17, [%[c], #32] \n" // kRGBToV
"ldr s0, [%[c], #64] \n" // kAddUV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0
"dup v21.8h, v16.h[1] \n" // U1
"dup v22.8h, v16.h[2] \n" // U2
"dup v23.8h, v16.h[3] \n" // U3
"dup v24.8h, v17.h[0] \n" // V0
"dup v26.8h, v17.h[1] \n" // V1
"dup v27.8h, v17.h[2] \n" // V2
"dup v28.8h, v17.h[3] \n" // V3
"dup v25.8h, v0.h[0] \n" // kAddUV
"1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"prfm pldl1keep, [%0, 448] \n"
"umull v3.8h, v2.8b, v24.8b \n" // R
"umlsl v3.8h, v1.8b, v28.8b \n" // G
"umlsl v3.8h, v0.8b, v27.8b \n" // B
"uxtl v4.8h, v0.8b \n"
"uxtl v5.8h, v1.8b \n"
"uxtl v6.8h, v2.8b \n"
"uxtl v7.8h, v3.8b \n"
"addhn v0.8b, v4.8h, v29.8h \n" // signed -> unsigned
"addhn v1.8b, v3.8h, v29.8h \n"
// U = B*U0 + G*U1 + R*U2 + A*U3
"mul v18.8h, v4.8h, v20.8h \n"
"mla v18.8h, v5.8h, v21.8h \n"
"mla v18.8h, v6.8h, v22.8h \n"
"mla v18.8h, v7.8h, v23.8h \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
// V = B*V0 + G*V1 + R*V2 + A*V3
"mul v19.8h, v4.8h, v24.8h \n"
"mla v19.8h, v5.8h, v26.8h \n"
"mla v19.8h, v6.8h, v27.8h \n"
"mla v19.8h, v7.8h, v28.8h \n"
"subhn v0.8b, v25.8h, v18.8h \n"
"subhn v1.8b, v25.8h, v19.8h \n"
"st1 {v0.8b}, [%1], #8 \n"
"st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: "r"(rgbuvconstants) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
"v27", "v28", "v29");
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width) // %3
: [c] "r"(c) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28");
}
static void ARGBToUV444MatrixRow_NEON_I8MM(
@ -2784,10 +2798,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
const struct ArgbConstants* c) {
asm volatile(
"ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
"movi v29.8h, #0x80, lsl #8 \n" // 128.0
"ldr q16, [%[c], #16] \n" // kRGBToU
"ldr q17, [%[c], #32] \n" // kRGBToV
"ldr s0, [%[c], #64] \n" // kAddUV
"dup v29.8h, v0.h[0] \n" // 128.0
"1: \n"
"ldp q0, q1, [%[src]], #32 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
@ -2807,11 +2823,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
"str d0, [%[dst_u]], #8 \n" // store 8 pixels U.
"str d1, [%[dst_v]], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: [src] "+r"(src_argb), // %[src]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants]
: [src] "+r"(src_argb), // %[src]
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [c] "r"(c) // %[c]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
"v29");
}
@ -2824,15 +2840,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
{18, 94, -112, 0}};
// ARGB row to U and V rows (444 variant).
// Thin wrapper: forwards to ARGBToUV444MatrixRow_NEON using the
// kArgbI601Constants table.
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
                         uint8_t* dst_u,
                         uint8_t* dst_v,
                         int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kArgbI601Constants);
}
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2840,26 +2853,15 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kARGBI601UVConstants);
&kArgbI601Constants);
}
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
{21, 107, -128, 0}};
// ARGB row to U and V rows (J444 variant).
// Thin wrapper: forwards to ARGBToUV444MatrixRow_NEON using the
// kArgbJPEGConstants table.
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
                          uint8_t* dst_u,
                          uint8_t* dst_v,
                          int width) {
  ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
                            &kArgbJPEGConstants);
}
void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2867,7 +2869,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kARGBJPEGUVConstants);
&kArgbJPEGConstants);
}
#define RGBTOUV_SETUP_REG \
@ -2906,12 +2908,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"ldr q17, [%[c], #32] \n" // kRGBToV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
"dup v20.8h, v16.h[0] \n" // U0
"dup v21.8h, v16.h[1] \n" // U1
"dup v22.8h, v16.h[2] \n" // U2
"dup v23.8h, v16.h[3] \n" // U3
"dup v24.8h, v17.h[0] \n" // V0
"dup v26.8h, v17.h[1] \n" // V1
"dup v27.8h, v17.h[2] \n" // V2
"dup v28.8h, v17.h[3] \n" // V3
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
"1: \n"
@ -2921,26 +2925,31 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v18.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v18.8h, v7.16b \n" // A 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
"urshr v18.8h, v18.8h, #2 \n"
// U = B*U0 + G*U1 + R*U2
// U = B*U0 + G*U1 + R*U2 + A*U3
"mul v3.8h, v0.8h, v20.8h \n"
"mla v3.8h, v1.8h, v21.8h \n"
"mla v3.8h, v2.8h, v22.8h \n"
"mla v3.8h, v18.8h, v23.8h \n"
// V = B*V0 + G*V1 + R*V2
"mul v4.8h, v0.8h, v23.8h \n"
"mla v4.8h, v1.8h, v24.8h \n"
"mla v4.8h, v2.8h, v26.8h \n"
// V = B*V0 + G*V1 + R*V2 + A*V3
"mul v4.8h, v0.8h, v24.8h \n"
"mla v4.8h, v1.8h, v26.8h \n"
"mla v4.8h, v2.8h, v27.8h \n"
"mla v4.8h, v18.8h, v28.8h \n"
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
"subhn v0.8b, v25.8h, v3.8h \n"
@ -2956,7 +2965,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"+r"(width) // %4
: [c] "r"(c) // %5
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
"v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28"
);
}
@ -2974,44 +2984,35 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #128 \n" // UB/VR coeff (0.500)
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
&kArgbJPEGConstants);
}
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
// ABGR rows to U and V rows.
// Selects the kAbgrI601Constants table and delegates all pixel work to the
// generic ARGBToUVMatrixRow_NEON; src_stride_abgr is passed through unchanged.
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
                      int src_stride_abgr,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const struct ArgbConstants* const matrix = &kAbgrI601Constants;
  ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
                         matrix);
}
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
// BGRA rows to U and V rows.
// Selects the kBgraI601Constants table and delegates all pixel work to the
// generic ARGBToUVMatrixRow_NEON; src_stride_bgra is passed through unchanged.
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
                      int src_stride_bgra,
                      uint8_t* dst_u,
                      uint8_t* dst_v,
                      int width) {
  const struct ArgbConstants* const matrix = &kBgraI601Constants;
  ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
                         matrix);
}
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
&kRgbaI601Constants);
}
void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
@ -3019,44 +3020,8 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_uj,
uint8_t* dst_vj,
int width) {
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
"movi v20.8h, #128 \n" // UB/VR coeff (0.500)
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_uj), // %2
"+r"(dst_vj), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
&kAbgrJPEGConstants);
}
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
@ -3149,126 +3114,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
);
}
// Hand-written U/V row for BGRA input.
// Each loop iteration reads 16 pixels (64 bytes) from src_bgra and 16 from the
// row src_stride_bgra bytes after it, sums horizontally adjacent pairs of both
// rows, and applies a rounding shift by 2 ("average of 4") before the U/V
// math. It then stores 8 U bytes and 8 V bytes. width is decremented by 16 per
// iteration and the loop repeats while the remainder is positive.
// NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are macros whose bodies are not
// visible here; presumably they load coefficients into v20-v25 and leave U in
// v0 / V in v1 - confirm against their definitions.
// NOTE(review): a matrix-based wrapper with this same name also appears in
// this listing; confirm only one definition is compiled.
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
// Pointer to the second source row.
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
// Channels are taken from the 4th, 3rd and 2nd de-interleaved bytes (v3, v2,
// v1); the first byte (v0) is overwritten without being used.
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v3.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// Hand-written U/V row for ABGR input.
// Each loop iteration reads 16 pixels (64 bytes) from src_abgr and 16 from the
// row src_stride_abgr bytes after it, sums horizontally adjacent pairs of both
// rows, and applies a rounding shift by 2 ("average of 4") before the U/V
// math. It then stores 8 U bytes and 8 V bytes. width is decremented by 16 per
// iteration and the loop repeats while the remainder is positive.
// NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are macros whose bodies are not
// visible here; presumably they load coefficients into v20-v25 and leave U in
// v0 / V in v1 - confirm against their definitions.
// NOTE(review): a matrix-based wrapper with this same name also appears in
// this listing; confirm only one definition is compiled.
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
// Pointer to the second source row.
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
// Sums are written "downwards" (v3 <- v2, v2 <- v1, v1 <- v0) so each source
// register is read before it is overwritten; the 4th byte (v3) is unused.
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v3.8h, #2 \n" // average of 4
"urshr v2.8h, v2.8h, #2 \n"
"urshr v1.8h, v1.8h, #2 \n"
RGBTOUV(v0.8h, v2.8h, v1.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
// Hand-written U/V row for RGBA input.
// Each loop iteration reads 16 pixels (64 bytes) from src_rgba and 16 from the
// row src_stride_rgba bytes after it, sums horizontally adjacent pairs of both
// rows, and applies a rounding shift by 2 ("average of 4") before the U/V
// math. It then stores 8 U bytes and 8 V bytes. width is decremented by 16 per
// iteration and the loop repeats while the remainder is positive.
// NOTE(review): RGBTOUV_SETUP_REG and RGBTOUV are macros whose bodies are not
// visible here; presumably they load coefficients into v20-v25 and leave U in
// v0 / V in v1 - confirm against their definitions.
// NOTE(review): a matrix-based wrapper with this same name also appears in
// this listing; confirm only one definition is compiled.
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
// Pointer to the second source row.
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
// Channels come from the 2nd, 3rd and 4th de-interleaved bytes (v1, v2, v3);
// the first byte (v0) is overwritten without being used.
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
@ -3483,18 +3328,19 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
);
}
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const int8_t* uvconstants) {
const struct ArgbConstants* c) {
const uint8_t* src1 = src + src_stride;
asm volatile(
"movi v23.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in
// 16-bit)
"ld2r {v24.4s, v25.4s}, [%[uvconstants]] \n"
"ldr q24, [%[c], #16] \n" // kRGBToU
"ldr q25, [%[c], #32] \n" // kRGBToV
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels
@ -3547,51 +3393,19 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
[dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants) // %[uvconstants]
: [c] "r"(c) // %[c]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
"v24", "v25");
}
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
// I8MM constants are stored negated such that we can store 128 in int8_t.
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
// Public matrix entry point for the I8MM U/V row.
// The implementation takes the ArgbConstants pointer itself, so the table is
// forwarded as-is rather than being repacked into a temporary int8_t array.
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
                                 int src_stride_argb,
                                 uint8_t* dst_u,
                                 uint8_t* dst_v,
                                 int width,
                                 const struct ArgbConstants* c) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
                                   width, c);
}
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@ -3600,7 +3414,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients);
&kArgbI601Constants);
}
void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3609,7 +3423,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients);
&kAbgrI601Constants);
}
void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@ -3618,7 +3432,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients);
&kBgraI601Constants);
}
void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@ -3627,35 +3441,16 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients);
&kRgbaI601Constants);
}
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
// I8MM constants are stored negated such that we can store 128 in int8_t.
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
// ARGB rows to UJ and VJ rows, I8MM variant.
// Thin wrapper: forwards to ARGBToUVMatrixRow_NEON_I8MM_Impl using the
// kArgbJPEGConstants table.
void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
                            int src_stride_argb,
                            uint8_t* dst_u,
                            uint8_t* dst_v,
                            int width) {
  ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v,
                                   width, &kArgbJPEGConstants);
}
void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3664,7 +3459,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVJCoefficients);
&kAbgrJPEGConstants);
}
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -3771,206 +3566,145 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
int width,
const struct ArgbConstants* c) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"ldr s16, [%3] \n" // load 4 coeffs
"ldr s17, [%3, #48] \n" // load kAddY[0]
"dup v18.16b, v16.b[0] \n" // B
"dup v19.16b, v16.b[1] \n" // G
"dup v20.16b, v16.b[2] \n" // R
"dup v21.16b, v16.b[3] \n" // A
"dup v22.8h, v17.h[0] \n" // bias
"1: \n"
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v6.8b \n" // B
"umull2 v1.8h, v2.16b, v6.16b \n"
"umull v0.8h, v2.8b, v18.8b \n" // B
"umull2 v1.8h, v2.16b, v18.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v3.8b, v7.8b \n" // G
"umlal2 v1.8h, v3.16b, v7.16b \n"
"umlal v0.8h, v4.8b, v16.8b \n" // R
"umlal2 v1.8h, v4.16b, v16.16b \n"
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v17.8h \n"
"umlal v0.8h, v3.8b, v19.8b \n" // G
"umlal2 v1.8h, v3.16b, v19.16b \n"
"umlal v0.8h, v4.8b, v20.8b \n" // R
"umlal2 v1.8h, v4.16b, v20.16b \n"
"umlal v0.8h, v5.8b, v21.8b \n" // A
"umlal2 v1.8h, v5.16b, v21.16b \n"
"addhn v0.8b, v0.8h, v22.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v22.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
: "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v19", "v20", "v21", "v22");
}
// Converts a row of 4-byte pixels to 8-bit Y using the dot-product extension.
//   src_argb: source pixels, 64 bytes (16 pixels) consumed per iteration.
//   dst_y:    destination, 16 bytes written per iteration.
//   width:    pixel count; decremented by 16 per iteration, loop runs while
//             the remainder is positive.
//   c:        the first 4 bytes of *c are the per-byte coefficients and the
//             16-bit value at byte offset 48 (kAddY[0]) is the bias.
// Each pixel's 4 bytes are dotted with the 4 coefficients in one UDOT, so any
// channel order is handled purely by the layout of the coefficient table.
void ARGBToYMatrixRow_NEON_DotProd(const uint8_t* src_argb,
                                   uint8_t* dst_y,
                                   int width,
                                   const struct ArgbConstants* c) {
  asm volatile(
      "ldr        s16, [%3]                      \n"  // load 4 coeffs
      "ldr        s17, [%3, #48]                 \n"  // load kAddY[0]
      "dup        v18.4s, v16.s[0]               \n"  // coeffs for every pixel
      "dup        v19.8h, v17.h[0]               \n"  // bias in every lane
      "1:                                        \n"
      "ld1        {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n"  // load 16
                                                                   // pixels.
      "subs       %w2, %w2, #16                  \n"  // 16 processed per loop.
      // UDOT accumulates, so the accumulators must be cleared every iteration.
      "movi       v0.16b, #0                     \n"
      "movi       v1.16b, #0                     \n"
      "movi       v2.16b, #0                     \n"
      "movi       v3.16b, #0                     \n"
      "udot       v0.4s, v4.16b, v18.16b         \n"
      "udot       v1.4s, v5.16b, v18.16b         \n"
      "udot       v2.4s, v6.16b, v18.16b         \n"
      "udot       v3.4s, v7.16b, v18.16b         \n"
      // Keep the low 16 bits of each 32-bit sum.
      "uzp1       v0.8h, v0.8h, v1.8h            \n"
      "uzp1       v1.8h, v2.8h, v3.8h            \n"
      // Add bias and take the high byte: 16 bit to 8 bit Y.
      "addhn      v0.8b, v0.8h, v19.8h           \n"
      "addhn      v1.8b, v1.8h, v19.8h           \n"
      "st1        {v0.8b, v1.8b}, [%1], #16      \n"  // store 16 pixels Y.
      "b.gt       1b                             \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_y),     // %1
        "+r"(width)      // %2
      : "r"(c)           // %3
      : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
        "v17", "v18", "v19");
}
// RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
// ARGB row to Y row. Forwards to ARGBToYMatrixRow_NEON with the
// kArgbI601Constants table.
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
}
// ARGB row to YJ row. Forwards to ARGBToYMatrixRow_NEON with the
// kArgbJPEGConstants table.
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
}
// ABGR row to Y row. Forwards to ARGBToYMatrixRow_NEON with the
// kAbgrI601Constants table.
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
}
// ABGR row to YJ row. Forwards to ARGBToYMatrixRow_NEON with the
// kAbgrJPEGConstants table.
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
}
// ARGB row to Y row, dot-product variant. Forwards to
// ARGBToYMatrixRow_NEON_DotProd with the kArgbI601Constants table.
void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
                             uint8_t* dst_y,
                             int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);
}
// ARGB row to YJ row, dot-product variant. Forwards to
// ARGBToYMatrixRow_NEON_DotProd with the kArgbJPEGConstants table.
void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
                              uint8_t* dst_yj,
                              int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);
}
// ABGR row to Y row, dot-product variant. Forwards to
// ARGBToYMatrixRow_NEON_DotProd with the kAbgrI601Constants table.
void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
                             uint8_t* dst_y,
                             int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);
}
// ABGR row to YJ row, dot-product variant. Forwards to
// ARGBToYMatrixRow_NEON_DotProd with the kAbgrJPEGConstants table.
void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
                              uint8_t* dst_yj,
                              int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
}
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4
//
// Per loop iteration: reads 16 pixels (64 bytes) from src_rgba and writes 16 Y
// bytes to dst_y. width is decremented by 16 and the loop repeats while the
// remainder is positive.
// Constants read from *c: the first 3 bytes are the multipliers for the 2nd,
// 3rd and 4th byte of each pixel, and the 16-bit value at byte offset 48 is
// the bias added before the high byte of each 16-bit sum is kept.
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n"
"dup v6.16b, v0.b[0] \n"
"dup v7.16b, v0.b[1] \n"
"dup v16.16b, v0.b[2] \n"
"dup v17.8h, v1.h[0] \n"
"1: \n"
// The first byte of each pixel lands in v1, which is never used as an input
// and is overwritten by the umull2 below.
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v6.8b \n" // B
"umull2 v1.8h, v2.16b, v6.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v3.8b, v7.8b \n" // G
"umlal2 v1.8h, v3.16b, v7.16b \n"
"umlal v0.8h, v4.8b, v16.8b \n" // R
"umlal2 v1.8h, v4.16b, v16.16b \n"
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v17.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
// RGBA row to Y row. Forwards to the shared ARGBToYMatrixRow_NEON with the
// kRgbaI601Constants table.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
}
// RGBA row to YJ row. Forwards to the shared ARGBToYMatrixRow_NEON with the
// kRgbaJPEGConstants table.
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
  ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
}
// BGRA row to Y row. Forwards to the shared ARGBToYMatrixRow_NEON with the
// kBgraI601Constants table.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
  ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
}
// RGBA row to Y row, dot-product variant.
// No separate implementation is needed for RGBA inputs: the shared
// ARGBToYMatrixRow_NEON_DotProd is called with the kRgbaI601Constants table.
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
                             uint8_t* dst_y,
                             int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);
}
// RGBA row to YJ row, dot-product variant.
// No separate implementation is needed for RGBA inputs: the shared
// ARGBToYMatrixRow_NEON_DotProd is called with the kRgbaJPEGConstants table.
void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
                              uint8_t* dst_yj,
                              int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
}
// BGRA row to Y row, dot-product variant.
// No separate implementation is needed for BGRA inputs: the shared
// ARGBToYMatrixRow_NEON_DotProd is called with the kBgraI601Constants table.
void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
                             uint8_t* dst_y,
                             int width) {
  ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);
}
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -3978,30 +3712,32 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
int width,
const struct ArgbConstants* c) {
asm volatile(
"ldr d0, [%3] \n" // load rgbconstants
"dup v5.16b, v0.b[0] \n"
"dup v6.16b, v0.b[1] \n"
"dup v7.16b, v0.b[2] \n"
"dup v16.8h, v0.h[2] \n"
"ldr s16, [%3] \n" // load 4 coeffs
"ldr s17, [%3, #48] \n" // load kAddY[0]
"dup v18.16b, v16.b[0] \n" // B
"dup v19.16b, v16.b[1] \n" // G
"dup v20.16b, v16.b[2] \n" // R
"dup v21.8h, v17.h[0] \n" // bias
"1: \n"
"ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v5.8b \n" // B
"umull2 v1.8h, v2.16b, v5.16b \n"
"umull v0.8h, v2.8b, v18.8b \n" // B
"umull2 v1.8h, v2.16b, v18.16b \n"
"prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v3.8b, v6.8b \n" // G
"umlal2 v1.8h, v3.16b, v6.16b \n"
"umlal v0.8h, v4.8b, v7.8b \n" // R
"umlal2 v1.8h, v4.16b, v7.16b \n"
"addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v16.8h \n"
"umlal v0.8h, v3.8b, v19.8b \n" // G
"umlal2 v1.8h, v3.16b, v19.16b \n"
"umlal v0.8h, v4.8b, v20.8b \n" // R
"umlal2 v1.8h, v4.16b, v20.16b \n"
"addhn v0.8b, v0.8h, v21.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v21.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n"
: "+r"(src_rgb), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
"v19", "v20", "v21");
}