ABGRToJ420 calls ARGBToI420Matrix

- Standardized libyuv ARGB-family (ARGB, ABGR, RGBA, BGRA) to YUV conversion by utilizing the generic MatrixRow architecture and explicit ArgbConstants.
- Consolidated ARGBToI420, ABGRToI420, BGRAToI420, and RGBAToI420 as wrappers for ARGBToI420Matrix.
- Refactored ABGRToJ420, ABGRToJ422, and ABGRToI422 to use generic matrix functions.
- Added matrix-based versions for NV21, I400, YUY2, and UYVY.
- Updated the RAW-to-I420/I422/I444 and RGB24-to-I420/I422/I444 dispatchers to use MatrixRow logic and explicit constants.
- Fixed parameter swap bugs in ARGBToI422, ARGBToJ422, and ABGRToJ422.
- Fixed a bug in the generic C implementation of the matrix row functions so that all 4 channels are processed correctly for all ARGB-family formats.
- Moved kShuffleAARRGGBB in row_gcc.cc to the top of the libyuv namespace for visibility.
- Cleaned up redundant format-specific row implementations.

Bug: libyuv:42280902
Change-Id: I67ffa4c476abc0d2dcc4650510d7bda91b65988e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7830291
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
Frank Barchard 2026-05-07 19:58:19 -07:00 committed by libyuv-scoped@luci-project-accounts.iam.gserviceaccount.com
parent 4aacbbdfb4
commit 4b4e68b372
10 changed files with 2858 additions and 3678 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1936 Version: 1937
Revision: DEPS Revision: DEPS
License: BSD-3-Clause License: BSD-3-Clause
License File: LICENSE License File: LICENSE

View File

@ -875,6 +875,19 @@ int BGRAToI420(const uint8_t* src_bgra,
int width, int width,
int height); int height);
// BGRA little endian (argb in memory) to I422.
LIBYUV_API
int BGRAToI422(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// ABGR little endian (rgba in memory) to I420. // ABGR little endian (rgba in memory) to I420.
LIBYUV_API LIBYUV_API
int ABGRToI420(const uint8_t* src_abgr, int ABGRToI420(const uint8_t* src_abgr,
@ -888,6 +901,19 @@ int ABGRToI420(const uint8_t* src_abgr,
int width, int width,
int height); int height);
// ABGR little endian (rgba in memory) to I422.
LIBYUV_API
int ABGRToI422(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGBA little endian (abgr in memory) to I420. // RGBA little endian (abgr in memory) to I420.
LIBYUV_API LIBYUV_API
int RGBAToI420(const uint8_t* src_rgba, int RGBAToI420(const uint8_t* src_rgba,
@ -901,6 +927,19 @@ int RGBAToI420(const uint8_t* src_rgba,
int width, int width,
int height); int height);
// RGBA little endian (abgr in memory) to I422.
LIBYUV_API
int RGBAToI422(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB little endian (bgr in memory) to I420. // RGB little endian (bgr in memory) to I420.
LIBYUV_API LIBYUV_API
int RGB24ToI420(const uint8_t* src_rgb24, int RGB24ToI420(const uint8_t* src_rgb24,

View File

@ -245,6 +245,19 @@ int ARGBToI422(const uint8_t* src_argb,
int width, int width,
int height); int height);
// Convert ABGR To I422.
LIBYUV_API
int ABGRToI422(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height);
// RGB to I422 with matrix. See ArgbConstants at the top of this file for usage. // RGB to I422 with matrix. See ArgbConstants at the top of this file for usage.
LIBYUV_API LIBYUV_API
int ARGBToI422Matrix(const uint8_t* src_argb, int ARGBToI422Matrix(const uint8_t* src_argb,
@ -458,7 +471,7 @@ int ARGBToUYVY(const uint8_t* src_argb,
// RAW to NV21 with Matrix // RAW to NV21 with Matrix
LIBYUV_API LIBYUV_API
int RGBToNV21Matrix(const uint8_t* src_raw, int RAWToNV21Matrix(const uint8_t* src_raw,
int src_stride_raw, int src_stride_raw,
uint8_t* dst_y, uint8_t* dst_y,
int dst_stride_y, int dst_stride_y,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1936 #define LIBYUV_VERSION 1937
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree. * be found in the AUTHORS file in the root of the source tree.
*/ */
#include "libyuv/convert_from_argb.h" // For ArgbConstants
#include "libyuv/planar_functions.h" #include "libyuv/planar_functions.h"
#include <assert.h> #include <assert.h>
@ -15,12 +16,10 @@
#include "libyuv/cpu_id.h" #include "libyuv/cpu_id.h"
#include "libyuv/row.h" #include "libyuv/row.h"
#include "libyuv/convert_from_argb.h"
#include "libyuv/scale_row.h" // for ScaleRowDown2 #include "libyuv/scale_row.h" // for ScaleRowDown2
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
extern "C" { extern "C" {
#endif #endif
@ -4745,8 +4744,8 @@ static int ARGBSobelize(const uint8_t* src_argb,
uint8_t* dst, uint8_t* dst,
int width)) { int width)) {
int y; int y;
void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width, void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
const struct ArgbConstants* c) = ARGBToYMatrixRow_C; ARGBToYJRow_C;
void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1, void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
uint8_t* dst_sobely, int width) = SobelYRow_C; uint8_t* dst_sobely, int width) = SobelYRow_C;
void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1, void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
@ -4763,65 +4762,57 @@ static int ARGBSobelize(const uint8_t* src_argb,
src_stride_argb = -src_stride_argb; src_stride_argb = -src_stride_argb;
} }
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3) #if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3; ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3; ARGBToYJRow = ARGBToYJRow_SSSE3;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX2) #if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) { if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2; ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2; ARGBToYJRow = ARGBToYJRow_AVX2;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW) #if defined(HAS_ARGBTOYROW_AVX512BW)
if (TestCpuFlag(kCpuHasAVX512BW)) { if (TestCpuFlag(kCpuHasAVX512BW)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW; ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
if (IS_ALIGNED(width, 64)) { if (IS_ALIGNED(width, 64)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW; ARGBToYJRow = ARGBToYJRow_AVX512BW;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON) #if defined(HAS_ARGBTOYJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON; ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON; ARGBToYJRow = ARGBToYJRow_NEON;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD) #if defined(HAS_ARGBTOYJROW_LSX)
if (TestCpuFlag(kCpuHasNeonDotProd)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
}
}
#endif
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
if (TestCpuFlag(kCpuHasLSX)) { if (TestCpuFlag(kCpuHasLSX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX; ARGBToYJRow = ARGBToYJRow_Any_LSX;
if (IS_ALIGNED(width, 16)) { if (IS_ALIGNED(width, 16)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX; ARGBToYJRow = ARGBToYJRow_LSX;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_LASX) #if defined(HAS_ARGBTOYJROW_LASX)
if (TestCpuFlag(kCpuHasLASX)) { if (TestCpuFlag(kCpuHasLASX)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX; ARGBToYJRow = ARGBToYJRow_Any_LASX;
if (IS_ALIGNED(width, 32)) { if (IS_ALIGNED(width, 32)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX; ARGBToYJRow = ARGBToYJRow_LASX;
} }
} }
#endif #endif
#if defined(HAS_ARGBTOYMATRIXROW_RVV) #if defined(HAS_ARGBTOYJROW_RVV)
if (TestCpuFlag(kCpuHasRVV)) { if (TestCpuFlag(kCpuHasRVV)) {
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV; ARGBToYJRow = ARGBToYJRow_RVV;
} }
#endif #endif
@ -4859,10 +4850,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
uint8_t* row_y2 = row_y1 + row_size; uint8_t* row_y2 = row_y1 + row_size;
if (!rows) if (!rows)
return 1; return 1;
ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants); ARGBToYJRow(src_argb, row_y0, width);
row_y0[-1] = row_y0[0]; row_y0[-1] = row_y0[0];
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind. memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants); ARGBToYJRow(src_argb, row_y1, width);
row_y1[-1] = row_y1[0]; row_y1[-1] = row_y1[0];
memset(row_y1 + width, row_y1[width - 1], 16); memset(row_y1 + width, row_y1[width - 1], 16);
memset(row_y2 + width, 0, 16); memset(row_y2 + width, 0, 16);
@ -4872,7 +4863,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
if (y < (height - 1)) { if (y < (height - 1)) {
src_argb += src_stride_argb; src_argb += src_stride_argb;
} }
ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants); ARGBToYJRow(src_argb, row_y2, width);
row_y2[-1] = row_y2[0]; row_y2[-1] = row_y2[0];
row_y2[width] = row_y2[width - 1]; row_y2[width] = row_y2[width - 1];

View File

@ -753,28 +753,31 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
MAKEROWYJ(RGBA, 3, 2, 1, 4) MAKEROWYJ(RGBA, 3, 2, 1, 4)
#undef MAKEROWYJ #undef MAKEROWYJ
static __inline uint8_t RGBToYMatrix(uint8_t r, static __inline uint8_t RGBToYMatrix(uint8_t b0,
uint8_t g, uint8_t b1,
uint8_t b, uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b + return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
c->kAddY[0]) >> c->kRGBToY[3] * b3 + c->kAddY[0]) >>
8; 8;
} }
static __inline uint8_t RGBToUMatrix(uint8_t r, static __inline uint8_t RGBToUMatrix(uint8_t b0,
uint8_t g, uint8_t b1,
uint8_t b, uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
return (c->kAddUV[0] - return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
(c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >> c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
8; 8;
} }
static __inline uint8_t RGBToVMatrix(uint8_t r, static __inline uint8_t RGBToVMatrix(uint8_t b0,
uint8_t g, uint8_t b1,
uint8_t b, uint8_t b2,
uint8_t b3,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
return (c->kAddUV[0] - return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
(c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >> c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
8; 8;
} }
@ -784,7 +787,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c); dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
src_argb += 4; src_argb += 4;
dst_y += 1; dst_y += 1;
} }
@ -799,25 +802,28 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
const uint8_t* src_argb1 = src_argb + src_stride_argb; const uint8_t* src_argb1 = src_argb + src_stride_argb;
int x; int x;
for (x = 0; x < width - 1; x += 2) { for (x = 0; x < width - 1; x += 2) {
uint8_t ab = uint8_t b0 =
(src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2; (src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
uint8_t ag = uint8_t b1 =
(src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2; (src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
uint8_t ar = uint8_t b2 =
(src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2; (src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
dst_u[0] = RGBToUMatrix(ar, ag, ab, c); uint8_t b3 =
dst_v[0] = RGBToVMatrix(ar, ag, ab, c); (src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
src_argb += 8; src_argb += 8;
src_argb1 += 8; src_argb1 += 8;
dst_u += 1; dst_u += 1;
dst_v += 1; dst_v += 1;
} }
if (width & 1) { if (width & 1) {
uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1; uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1; uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1; uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
dst_u[0] = RGBToUMatrix(ar, ag, ab, c); uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
dst_v[0] = RGBToVMatrix(ar, ag, ab, c); dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
} }
} }
@ -828,11 +834,10 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
int x; int x;
for (x = 0; x < width; ++x) { for (x = 0; x < width; ++x) {
uint8_t ab = src_argb[0]; dst_u[0] =
uint8_t ag = src_argb[1]; RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
uint8_t ar = src_argb[2]; dst_v[0] =
dst_u[0] = RGBToUMatrix(ar, ag, ab, c); RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
src_argb += 4; src_argb += 4;
dst_u += 1; dst_u += 1;
dst_v += 1; dst_v += 1;
@ -1513,16 +1518,16 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB); YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \ #define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \ extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \
ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \ ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
-(RV), 0, AY, AUV); \ -(RV), 0, AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \ extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \
ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \ ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
-(BV), 0, AY, AUV); \ -(BV), 0, AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \ extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \
ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \ ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \
-(GV), -(RV), AY, AUV); \ -(GV), -(RV), AY, AUV); \
const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \ extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \
ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \ ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \
-(GV), -(BV), AY, AUV); -(GV), -(BV), AY, AUV);

View File

@ -1848,32 +1848,41 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"vld1.8 {d16}, [%4] \n" // load kRGBToU "vld1.8 {d24}, [%4] \n" // load kRGBToU
"vld1.8 {d17}, [%5] \n" // load kRGBToV "vld1.8 {d25}, [%5] \n" // load kRGBToV
"vld1.16 {d18[0]}, [%6] \n" // load kAddUV[0] "vld1.16 {d26[0]}, [%6] \n" // load kAddUV[0]
"vabs.s8 d16, d16 \n" // BU, GU, RU "vmovl.s8 q10, d24 \n" // U coeffs (8 shorts)
"vabs.s8 d17, d17 \n" // BV, GV, RV "vmovl.s8 q11, d25 \n" // V coeffs (8 shorts)
"vdup.8 d20, d16[0] \n" // BU "vdup.16 q6, d26[0] \n" // bias
"vdup.8 d21, d16[1] \n" // GU
"vdup.8 d22, d16[2] \n" // RU
"vdup.8 d23, d17[0] \n" // BV
"vdup.8 d24, d17[1] \n" // GV
"vdup.8 d25, d17[2] \n" // RV
"vdup.16 q15, d18[0] \n" // kAddUV
"1: \n" "1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop. "subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B * BU
"vmlsl.u8 q2, d1, d21 \n" // - G * GU
"vmlsl.u8 q2, d2, d22 \n" // - R * RU
"vmull.u8 q3, d2, d25 \n" // R * RV "vmovl.u8 q4, d0 \n" // B
"vmlsl.u8 q3, d1, d24 \n" // - G * GV "vmovl.u8 q5, d1 \n" // G
"vmlsl.u8 q3, d0, d23 \n" // - B * BV "vmovl.u8 q7, d2 \n" // R
"vmovl.u8 q8, d3 \n" // A
"vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned "vdup.16 q12, d20[0] \n"
"vaddhn.u16 d1, q3, q15 \n" "vmul.s16 q2, q4, q12 \n" // U = B * U0
"vdup.16 q12, d20[1] \n"
"vmla.s16 q2, q5, q12 \n" // U += G * U1
"vdup.16 q12, d20[2] \n"
"vmla.s16 q2, q7, q12 \n" // U += R * U2
"vdup.16 q12, d20[3] \n"
"vmla.s16 q2, q8, q12 \n" // U += A * U3
"vdup.16 q12, d22[0] \n"
"vmul.s16 q3, q4, q12 \n" // V = B * V0
"vdup.16 q12, d22[1] \n"
"vmla.s16 q3, q5, q12 \n" // V += G * V1
"vdup.16 q12, d22[2] \n"
"vmla.s16 q3, q7, q12 \n" // V += R * V2
"vdup.16 q12, d22[3] \n"
"vmla.s16 q3, q8, q12 \n" // V += A * V3
"vsubhn.s16 d0, q6, q2 \n" // 128.0 - U
"vsubhn.s16 d1, q6, q3 \n" // 128.0 - V
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U. "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V. "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
@ -1885,8 +1894,8 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
: "r"(&c->kRGBToU), // %4 : "r"(&c->kRGBToU), // %4
"r"(&c->kRGBToV), // %5 "r"(&c->kRGBToV), // %5
"r"(&c->kAddUV) // %6 "r"(&c->kAddUV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q12", "q13", "q14", "q15"); "q10", "q11", "q12");
} }
void ARGBToUV444Row_NEON(const uint8_t* src_argb, void ARGBToUV444Row_NEON(const uint8_t* src_argb,
@ -1926,16 +1935,11 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb; const uint8_t* src_argb_1 = src_argb + src_stride_argb;
asm volatile ( asm volatile (
"vld1.8 {d18}, [%5] \n" // load kRGBToU "vld1.8 {d24}, [%5] \n" // load kRGBToU (8 bytes, only 4 used)
"vld1.8 {d19}, [%6] \n" // load kRGBToV "vld1.8 {d25}, [%6] \n" // load kRGBToV
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17) "vmovl.s8 q14, d24 \n" // U coeffs in d28
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19) "vmovl.s8 q15, d25 \n" // V coeffs in d30
"vdup.16 q10, d16[0] \n" // U0 "vmov.u16 q11, #0x8000 \n" // 128.0 bias
"vdup.16 q11, d16[1] \n" // U1
"vdup.16 q12, d16[2] \n" // U2
"vdup.16 q13, d18[0] \n" // V0
"vdup.16 q14, d18[1] \n" // V1
"vdup.16 q15, d18[2] \n" // V2
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels. "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@ -1944,28 +1948,39 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts. "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts. "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts. "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels. "vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels. "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. "vld4.8 {d9, d11, d13, d15}, [%1]! \n"
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. "vpadal.u8 q0, q4 \n" // B
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts. "vpadal.u8 q1, q5 \n" // G
"vpadal.u8 q2, q6 \n" // R
"vpadal.u8 q3, q7 \n" // A
"vrshr.u16 q0, q0, #2 \n" // average of 4 "vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n" "vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n" "vrshr.u16 q2, q2, #2 \n"
"vrshr.u16 q3, q3, #2 \n"
"vmov.u16 q3, #0x8000 \n" // 128.0 "vdup.16 q12, d28[0] \n"
"vmul.s16 q8, q0, q12 \n" // U = B * U0
"vmul.s16 q8, q0, q10 \n" // U = B * U0 "vdup.16 q12, d28[1] \n"
"vmla.s16 q8, q1, q11 \n" // U += G * U1 "vmla.s16 q8, q1, q12 \n" // U += G * U1
"vdup.16 q12, d28[2] \n"
"vmla.s16 q8, q2, q12 \n" // U += R * U2 "vmla.s16 q8, q2, q12 \n" // U += R * U2
"vdup.16 q12, d28[3] \n"
"vmla.s16 q8, q3, q12 \n" // U += A * U3
"vmul.s16 q9, q0, q13 \n" // V = B * V0 "vdup.16 q12, d30[0] \n"
"vmla.s16 q9, q1, q14 \n" // V += G * V1 "vmul.s16 q9, q0, q12 \n" // V = B * V0
"vmla.s16 q9, q2, q15 \n" // V += R * V2 "vdup.16 q12, d30[1] \n"
"vmla.s16 q9, q1, q12 \n" // V += G * V1
"vdup.16 q12, d30[2] \n"
"vmla.s16 q9, q2, q12 \n" // V += R * V2
"vdup.16 q12, d30[3] \n"
"vmla.s16 q9, q3, q12 \n" // V += A * V3
"vsubhn.s16 d0, q3, q8 \n" // 128.0 - U "vsubhn.s16 d0, q11, q8 \n" // 128.0 - U
"vsubhn.s16 d1, q3, q9 \n" // 128.0 - V "vsubhn.s16 d1, q11, q9 \n" // 128.0 - V
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U. "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V. "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
@ -1978,7 +1993,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
: "r"(&c->kRGBToU), // %5 : "r"(&c->kRGBToU), // %5
"r"(&c->kRGBToV) // %6 "r"(&c->kRGBToV) // %6
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" "q8", "q9", "q11", "q12", "q14", "q15"
); );
} }
@ -2212,44 +2227,8 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
"add %1, %0, %1 \n" // src_stride + src_bgra &kBgraI601Constants);
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q1, q1, #2 \n" // average of 4
"vrshr.u16 q2, q2, #2 \n"
"vrshr.u16 q3, q3, #2 \n"
RGBTOUV(q3, q2, q1)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_stride_bgra), // %1
"+r"(dst_u), // %2-
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
} }
void ABGRToUVRow_NEON(const uint8_t* src_abgr, void ABGRToUVRow_NEON(const uint8_t* src_abgr,
@ -2257,44 +2236,8 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
"add %1, %0, %1 \n" // src_stride + src_abgr &kAbgrI601Constants);
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_stride_abgr), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
} }
void RGBAToUVRow_NEON(const uint8_t* src_rgba, void RGBAToUVRow_NEON(const uint8_t* src_rgba,
@ -2302,44 +2245,8 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
asm volatile ( ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
"add %1, %0, %1 \n" // src_stride + src_rgba &kRgbaI601Constants);
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
"vmov.u16 q15, #0x8000 \n" // 128.0
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"subs %4, %4, #16 \n" // 16 processed per loop.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #2 \n" // average of 4
"vrshr.u16 q1, q1, #2 \n"
"vrshr.u16 q2, q2, #2 \n"
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_stride_rgba), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
} }
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
@ -2801,15 +2708,16 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY "vld1.8 {d24}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0] "vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY "vdup.8 d20, d24[0] \n" // B
"vdup.8 d21, d16[1] \n" // GY "vdup.8 d21, d24[1] \n" // G
"vdup.8 d22, d16[2] \n" // RY "vdup.8 d22, d24[2] \n" // R
"vdup.16 q12, d18[0] \n" // AY "vdup.8 d23, d24[3] \n" // A
"vdup.16 q12, d25[0] \n" // bias
"1: \n" "1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 pixels
"subs %1, %1, #16 \n" // 16 processed per loop. "subs %1, %1, #16 \n" // 16 processed per loop.
"vmull.u8 q8, d0, d20 \n" // B "vmull.u8 q8, d0, d20 \n" // B
"vmull.u8 q9, d1, d20 \n" "vmull.u8 q9, d1, d20 \n"
@ -2817,6 +2725,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
"vmlal.u8 q9, d3, d21 \n" "vmlal.u8 q9, d3, d21 \n"
"vmlal.u8 q8, d4, d22 \n" // R "vmlal.u8 q8, d4, d22 \n" // R
"vmlal.u8 q9, d5, d22 \n" "vmlal.u8 q9, d5, d22 \n"
"vmlal.u8 q8, d6, d23 \n" // A
"vmlal.u8 q9, d7, d23 \n"
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y "vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
"vaddhn.u16 d1, q9, q12 \n" "vaddhn.u16 d1, q9, q12 \n"
"vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y. "vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y.
@ -2826,8 +2736,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
"+r"(dst_y) // %2 "+r"(dst_y) // %2
: "r"(&c->kRGBToY), // %3 : "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4 "r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
"q12"); "d24", "d25");
} }
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
@ -2846,52 +2756,20 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants); ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
} }
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY
"vdup.8 d21, d16[1] \n" // GY
"vdup.8 d22, d16[2] \n" // RY
"vdup.16 q12, d18[0] \n" // AY
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop.
"vmull.u8 q8, d2, d20 \n" // B
"vmull.u8 q9, d3, d20 \n"
"vmlal.u8 q8, d4, d21 \n" // G
"vmlal.u8 q9, d5, d21 \n"
"vmlal.u8 q8, d6, d22 \n" // R
"vmlal.u8 q9, d7, d22 \n"
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
"vaddhn.u16 d1, q9, q12 \n"
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
"q12");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants); ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
} }
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants); ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
} }
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants); ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
}
void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
} }
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -2899,12 +2777,12 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"vld1.8 {d16}, [%3] \n" // load kRGBToY "vld1.8 {d24}, [%3] \n" // load kRGBToY
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0] "vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
"vdup.8 d20, d16[0] \n" // BY "vdup.8 d20, d24[0] \n" // BY
"vdup.8 d21, d16[1] \n" // GY "vdup.8 d21, d24[1] \n" // GY
"vdup.8 d22, d16[2] \n" // RY "vdup.8 d22, d24[2] \n" // RY
"vdup.16 q12, d18[0] \n" // AY "vdup.16 q12, d25[0] \n" // AY
"1: \n" "1: \n"
"vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of "vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
// RGB24. // RGB24.
@ -2925,8 +2803,8 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
"+r"(width) // %2 "+r"(width) // %2
: "r"(&c->kRGBToY), // %3 : "r"(&c->kRGBToY), // %3
"r"(&c->kAddY) // %4 "r"(&c->kAddY) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22", : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
"q12"); "d24", "d25");
} }

View File

@ -2736,47 +2736,61 @@ struct RgbUVConstants {
}; };
// 8x1 pixels. // 8x1 pixels.
static void ARGBToUV444MatrixRow_NEON( void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"ldr d0, [%4] \n" // load rgbuvconstants "ldr q16, [%[c], #16] \n" // kRGBToU
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient "ldr q17, [%[c], #32] \n" // kRGBToV
"dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient "ldr s0, [%[c], #64] \n" // kAddUV
"dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient "dup v20.8h, v16.h[0] \n" // U0
"neg v24.16b, v24.16b \n" "dup v21.8h, v16.h[1] \n" // U1
"movi v29.8h, #0x80, lsl #8 \n" // 128.0 "dup v22.8h, v16.h[2] \n" // U2
"dup v23.8h, v16.h[3] \n" // U3
"dup v24.8h, v17.h[0] \n" // V0
"dup v26.8h, v17.h[1] \n" // V1
"dup v27.8h, v17.h[2] \n" // V2
"dup v28.8h, v17.h[3] \n" // V3
"dup v25.8h, v0.h[0] \n" // kAddUV
"1: \n" "1: \n"
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
"subs %w3, %w3, #8 \n" // 8 processed per loop. "subs %w3, %w3, #8 \n" // 8 processed per loop.
"umull v4.8h, v0.8b, v24.8b \n" // B
"umlsl v4.8h, v1.8b, v25.8b \n" // G
"umlsl v4.8h, v2.8b, v26.8b \n" // R
"prfm pldl1keep, [%0, 448] \n"
"umull v3.8h, v2.8b, v24.8b \n" // R "uxtl v4.8h, v0.8b \n"
"umlsl v3.8h, v1.8b, v28.8b \n" // G "uxtl v5.8h, v1.8b \n"
"umlsl v3.8h, v0.8b, v27.8b \n" // B "uxtl v6.8h, v2.8b \n"
"uxtl v7.8h, v3.8b \n"
"addhn v0.8b, v4.8h, v29.8h \n" // signed -> unsigned // U = B*U0 + G*U1 + R*U2 + A*U3
"addhn v1.8b, v3.8h, v29.8h \n" "mul v18.8h, v4.8h, v20.8h \n"
"mla v18.8h, v5.8h, v21.8h \n"
"mla v18.8h, v6.8h, v22.8h \n"
"mla v18.8h, v7.8h, v23.8h \n"
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. // V = B*V0 + G*V1 + R*V2 + A*V3
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. "mul v19.8h, v4.8h, v24.8h \n"
"mla v19.8h, v5.8h, v26.8h \n"
"mla v19.8h, v6.8h, v27.8h \n"
"mla v19.8h, v7.8h, v28.8h \n"
"subhn v0.8b, v25.8h, v18.8h \n"
"subhn v1.8b, v25.8h, v19.8h \n"
"st1 {v0.8b}, [%1], #8 \n"
"st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
"+r"(width) // %3 "+r"(width) // %3
: "r"(rgbuvconstants) // %4 : [c] "r"(c) // %4
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v27", "v28", "v29"); "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28");
} }
static void ARGBToUV444MatrixRow_NEON_I8MM( static void ARGBToUV444MatrixRow_NEON_I8MM(
@ -2784,10 +2798,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n" "ldr q16, [%[c], #16] \n" // kRGBToU
"movi v29.8h, #0x80, lsl #8 \n" // 128.0 "ldr q17, [%[c], #32] \n" // kRGBToV
"ldr s0, [%[c], #64] \n" // kAddUV
"dup v29.8h, v0.h[0] \n" // 128.0
"1: \n" "1: \n"
"ldp q0, q1, [%[src]], #32 \n" "ldp q0, q1, [%[src]], #32 \n"
"subs %w[width], %w[width], #8 \n" // 8 processed per loop. "subs %w[width], %w[width], #8 \n" // 8 processed per loop.
@ -2811,7 +2827,7 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
[dst_u] "+r"(dst_u), // %[dst_u] [dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v] [dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width] [width] "+r"(width) // %[width]
: [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants] : [c] "r"(c) // %[c]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
"v29"); "v29");
} }
@ -2824,15 +2840,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
// VG -0.7344 coefficient = -94 // VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112 // VR 0.875 coefficient = 112
static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
{18, 94, -112, 0}};
void ARGBToUV444Row_NEON(const uint8_t* src_argb, void ARGBToUV444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kARGBI601UVConstants); &kArgbI601Constants);
} }
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb, void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2840,26 +2853,15 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kARGBI601UVConstants); &kArgbI601Constants);
} }
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
{21, 107, -128, 0}};
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb, void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width, ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
&kARGBJPEGUVConstants); &kArgbJPEGConstants);
} }
void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb, void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
@ -2867,7 +2869,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width, ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
&kARGBJPEGUVConstants); &kArgbJPEGConstants);
} }
#define RGBTOUV_SETUP_REG \ #define RGBTOUV_SETUP_REG \
@ -2906,12 +2908,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"ldr q17, [%[c], #32] \n" // kRGBToV "ldr q17, [%[c], #32] \n" // kRGBToV
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit "sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit "sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
"dup v20.8h, v16.h[0] \n" // U0 (-BU) "dup v20.8h, v16.h[0] \n" // U0
"dup v21.8h, v16.h[1] \n" // U1 (-GU) "dup v21.8h, v16.h[1] \n" // U1
"dup v22.8h, v16.h[2] \n" // U2 (-RU) "dup v22.8h, v16.h[2] \n" // U2
"dup v23.8h, v17.h[0] \n" // V0 (-BV) "dup v23.8h, v16.h[3] \n" // U3
"dup v24.8h, v17.h[1] \n" // V1 (-GV) "dup v24.8h, v17.h[0] \n" // V0
"dup v26.8h, v17.h[2] \n" // V2 (-RV) "dup v26.8h, v17.h[1] \n" // V1
"dup v27.8h, v17.h[2] \n" // V2
"dup v28.8h, v17.h[3] \n" // V3
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000) "movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
"1: \n" "1: \n"
@ -2921,26 +2925,31 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"uaddlp v18.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more. "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts. "uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n" "prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts. "uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts. "uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"uadalp v18.8h, v7.16b \n" // A 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4 "urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n" "urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n" "urshr v2.8h, v2.8h, #2 \n"
"urshr v18.8h, v18.8h, #2 \n"
// U = B*U0 + G*U1 + R*U2 // U = B*U0 + G*U1 + R*U2 + A*U3
"mul v3.8h, v0.8h, v20.8h \n" "mul v3.8h, v0.8h, v20.8h \n"
"mla v3.8h, v1.8h, v21.8h \n" "mla v3.8h, v1.8h, v21.8h \n"
"mla v3.8h, v2.8h, v22.8h \n" "mla v3.8h, v2.8h, v22.8h \n"
"mla v3.8h, v18.8h, v23.8h \n"
// V = B*V0 + G*V1 + R*V2 // V = B*V0 + G*V1 + R*V2 + A*V3
"mul v4.8h, v0.8h, v23.8h \n" "mul v4.8h, v0.8h, v24.8h \n"
"mla v4.8h, v1.8h, v24.8h \n" "mla v4.8h, v1.8h, v26.8h \n"
"mla v4.8h, v2.8h, v26.8h \n" "mla v4.8h, v2.8h, v27.8h \n"
"mla v4.8h, v18.8h, v28.8h \n"
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8 // U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
"subhn v0.8b, v25.8h, v3.8h \n" "subhn v0.8b, v25.8h, v3.8h \n"
@ -2956,7 +2965,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
"+r"(width) // %4 "+r"(width) // %4
: [c] "r"(c) // %5 : [c] "r"(c) // %5
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26" "v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
"v27", "v28"
); );
} }
@ -2974,44 +2984,35 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb; ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
asm volatile ( &kArgbJPEGConstants);
"movi v20.8h, #128 \n" // UB/VR coeff (0.500) }
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4 void ABGRToUVRow_NEON(const uint8_t* src_abgr,
"urshr v1.8h, v1.8h, #2 \n" int src_stride_abgr,
"urshr v2.8h, v2.8h, #2 \n" uint8_t* dst_u,
uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
&kAbgrI601Constants);
}
RGBTOUV(v0.8h, v1.8h, v2.8h) void BGRAToUVRow_NEON(const uint8_t* src_bgra,
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U. int src_stride_bgra,
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V. uint8_t* dst_u,
"b.gt 1b \n" uint8_t* dst_v,
: "+r"(src_argb), // %0 int width) {
"+r"(src_argb_1), // %1 ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
"+r"(dst_u), // %2 &kBgraI601Constants);
"+r"(dst_v), // %3 }
"+r"(width) // %4
: void RGBAToUVRow_NEON(const uint8_t* src_rgba,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", int src_stride_rgba,
"v20", "v21", "v22", "v23", "v24", "v25" uint8_t* dst_u,
); uint8_t* dst_v,
int width) {
ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
&kRgbaI601Constants);
} }
void ABGRToUVJRow_NEON(const uint8_t* src_abgr, void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
@ -3019,44 +3020,8 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
uint8_t* dst_uj, uint8_t* dst_uj,
uint8_t* dst_vj, uint8_t* dst_vj,
int width) { int width) {
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr; ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
asm volatile ( &kAbgrJPEGConstants);
"movi v20.8h, #128 \n" // UB/VR coeff (0.500)
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_uj), // %2
"+r"(dst_vj), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
} }
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24, void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
@ -3149,126 +3114,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
); );
} }
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v3.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
int src_stride_abgr,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v3.8h, #2 \n" // average of 4
"urshr v2.8h, v2.8h, #2 \n"
"urshr v1.8h, v1.8h, #2 \n"
RGBTOUV(v0.8h, v2.8h, v1.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
int src_stride_rgba,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
"uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #2 \n" // average of 4
"urshr v1.8h, v1.8h, #2 \n"
"urshr v2.8h, v2.8h, #2 \n"
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24, void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_u,
@ -3483,18 +3328,19 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
); );
} }
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout. // Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src, static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
int src_stride, int src_stride,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const int8_t* uvconstants) { const struct ArgbConstants* c) {
const uint8_t* src1 = src + src_stride; const uint8_t* src1 = src + src_stride;
asm volatile( asm volatile(
"movi v23.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in "movi v23.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in
// 16-bit) // 16-bit)
"ld2r {v24.4s, v25.4s}, [%[uvconstants]] \n" "ldr q24, [%[c], #16] \n" // kRGBToU
"ldr q25, [%[c], #32] \n" // kRGBToV
"1: \n" "1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels "ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels
@ -3547,51 +3393,19 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
[dst_u] "+r"(dst_u), // %[dst_u] [dst_u] "+r"(dst_u), // %[dst_u]
[dst_v] "+r"(dst_v), // %[dst_v] [dst_v] "+r"(dst_v), // %[dst_v]
[width] "+r"(width) // %[width] [width] "+r"(width) // %[width]
: [uvconstants] "r"(uvconstants) // %[uvconstants] : [c] "r"(c) // %[c]
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
"v24", "v25"); "v24", "v25");
} }
// RGB to BT601 coefficients
// UB 0.875 coefficient = 112
// UG -0.5781 coefficient = -74
// UR -0.2969 coefficient = -38
// VB -0.1406 coefficient = -18
// VG -0.7344 coefficient = -94
// VR 0.875 coefficient = 112
// I8MM constants are stored negated such that we can store 128 in int8_t.
static const int8_t kARGBToUVCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-112, 74, 38, 0, 18, 94, -112, 0,
};
static const int8_t kABGRToUVCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
38, 74, -112, 0, -112, 94, 18, 0,
};
static const int8_t kBGRAToUVCoefficients[] = {
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
0, 38, 74, -112, 0, -112, 94, 18,
};
static const int8_t kRGBAToUVCoefficients[] = {
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
0, -112, 74, 38, 0, 18, 94, -112,
};
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb, void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
int8_t uvconstants[8] = {
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
uvconstants); c);
} }
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb, void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
@ -3600,7 +3414,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVCoefficients); &kArgbI601Constants);
} }
void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr, void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3609,7 +3423,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVCoefficients); &kAbgrI601Constants);
} }
void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra, void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
@ -3618,7 +3432,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
kBGRAToUVCoefficients); &kBgraI601Constants);
} }
void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba, void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
@ -3627,35 +3441,16 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
kRGBAToUVCoefficients); &kRgbaI601Constants);
} }
// RGB to JPEG coefficients
// UB 0.500 coefficient = 128
// UG -0.33126 coefficient = -85
// UR -0.16874 coefficient = -43
// VB -0.08131 coefficient = -21
// VG -0.41869 coefficient = -107
// VR 0.500 coefficient = 128
// I8MM constants are stored negated such that we can store 128 in int8_t.
static const int8_t kARGBToUVJCoefficients[] = {
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
-128, 85, 43, 0, 21, 107, -128, 0,
};
static const int8_t kABGRToUVJCoefficients[] = {
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
43, 85, -128, 0, -128, 107, 21, 0,
};
void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb, void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
int src_stride_argb, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
kARGBToUVJCoefficients); &kArgbJPEGConstants);
} }
void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr, void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
@ -3664,7 +3459,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
uint8_t* dst_v, uint8_t* dst_v,
int width) { int width) {
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width, ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
kABGRToUVJCoefficients); &kAbgrJPEGConstants);
} }
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) { void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
@ -3771,206 +3566,145 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"ldr s0, [%3] \n" // load rgbconstants "ldr s16, [%3] \n" // load 4 coeffs
"ldr s1, [%3, #48] \n" "ldr s17, [%3, #48] \n" // load kAddY[0]
"dup v6.16b, v0.b[0] \n" "dup v18.16b, v16.b[0] \n" // B
"dup v7.16b, v0.b[1] \n" "dup v19.16b, v16.b[1] \n" // G
"dup v16.16b, v0.b[2] \n" "dup v20.16b, v16.b[2] \n" // R
"dup v17.8h, v1.h[0] \n" "dup v21.16b, v16.b[3] \n" // A
"dup v22.8h, v17.h[0] \n" // bias
"1: \n" "1: \n"
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16 "ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v6.8b \n" // B "umull v0.8h, v2.8b, v18.8b \n" // B
"umull2 v1.8h, v2.16b, v6.16b \n" "umull2 v1.8h, v2.16b, v18.16b \n"
"prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v3.8b, v7.8b \n" // G "umlal v0.8h, v3.8b, v19.8b \n" // G
"umlal2 v1.8h, v3.16b, v7.16b \n" "umlal2 v1.8h, v3.16b, v19.16b \n"
"umlal v0.8h, v4.8b, v16.8b \n" // R "umlal v0.8h, v4.8b, v20.8b \n" // R
"umlal2 v1.8h, v4.16b, v16.16b \n" "umlal2 v1.8h, v4.16b, v20.16b \n"
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y "umlal v0.8h, v5.8b, v21.8b \n" // A
"addhn v1.8b, v1.8h, v17.8h \n" "umlal2 v1.8h, v5.16b, v21.16b \n"
"addhn v0.8b, v0.8h, v22.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v22.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(c) // %3 : "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
"v17"); "v19", "v20", "v21", "v22");
} }
void ARGBToYMatrixRow_NEON_DotProd( void ARGBToYMatrixRow_NEON_DotProd(
const uint8_t* src_argb, const uint8_t* src_argb,
uint8_t* dst_y, uint8_t* dst_y,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"ldr s0, [%3] \n" // load rgbconstants "ldr s16, [%3] \n" // load 4 coeffs
"ldr s1, [%3, #48] \n" "ldr s17, [%3, #48] \n" // load kAddY[0]
"dup v16.4s, v0.s[0] \n" "dup v18.4s, v16.s[0] \n"
"dup v17.8h, v1.h[0] \n" "dup v19.8h, v17.h[0] \n"
"1: \n" "1: \n"
"ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16 "ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
"movi v0.16b, #0 \n" "movi v0.16b, #0 \n"
"movi v1.16b, #0 \n" "movi v1.16b, #0 \n"
"movi v2.16b, #0 \n" "movi v2.16b, #0 \n"
"movi v3.16b, #0 \n" "movi v3.16b, #0 \n"
"udot v0.4s, v4.16b, v16.16b \n" "udot v0.4s, v4.16b, v18.16b \n"
"udot v1.4s, v5.16b, v16.16b \n" "udot v1.4s, v5.16b, v18.16b \n"
"udot v2.4s, v6.16b, v16.16b \n" "udot v2.4s, v6.16b, v18.16b \n"
"udot v3.4s, v7.16b, v16.16b \n" "udot v3.4s, v7.16b, v18.16b \n"
"uzp1 v0.8h, v0.8h, v1.8h \n" "uzp1 v0.8h, v0.8h, v1.8h \n"
"uzp1 v1.8h, v2.8h, v3.8h \n" "uzp1 v1.8h, v2.8h, v3.8h \n"
"addhn v0.8b, v0.8h, v17.8h \n" "addhn v0.8b, v0.8h, v19.8h \n"
"addhn v1.8b, v1.8h, v17.8h \n" "addhn v1.8b, v1.8h, v19.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(c) // %3 : "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
"v17");
} }
// RGB to JPeg coefficients // RGB to JPeg coefficients
// B * 0.1140 coefficient = 29
// G * 0.5870 coefficient = 150
// R * 0.2990 coefficient = 77
// Add 0.5
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
// RGB to BT.601 coefficients
// B * 0.1016 coefficient = 25
// G * 0.5078 coefficient = 129
// R * 0.2578 coefficient = 66
// Add 16.5 = 0x1080
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) { void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants); ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
} }
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) { void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants); ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
} }
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) { void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants); ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
} }
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) { void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants); ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
} }
void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb, void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants); ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);
} }
void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb, void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
uint8_t* dst_yj, uint8_t* dst_yj,
int width) { int width) {
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants); ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);
} }
void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr, void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants); ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);
} }
void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr, void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
uint8_t* dst_yj, uint8_t* dst_yj,
int width) { int width) {
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants); ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
} }
// RGBA expects first value to be A and ignored, then 3 values to contain RGB. // RGBA expects first value to be A and ignored, then 3 values to contain RGB.
// Same code as ARGB, except the LD4
// Converts a row of 4-byte pixels to 8-bit Y using bytes 1..3 of each pixel;
// byte 0 is loaded but never multiplied.
// Per pixel: y = (byte1 * w0 + byte2 * w1 + byte3 * w2 + bias) >> 8, where
// w0..w2 are the first three bytes at c and bias is the 16-bit value read at
// byte offset 48 from c.
// NOTE(review): offset 48 presumably addresses the fourth member of
// struct ArgbConstants (the bias slot in the initializers); the struct
// definition is not visible here - confirm.
// src_rgba: source pixels, 64 bytes consumed per loop iteration.
// dst_y:    destination, 16 bytes written per loop iteration.
// width:    pixel count. The loop body always runs at least once and width
//           drops by 16 per pass, so a width that is not a positive multiple
//           of 16 reads and writes beyond width pixels.
//           NOTE(review): presumably callers guarantee this - confirm.
// c:        weight and bias table.
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
uint8_t* dst_y,
int width,
const struct ArgbConstants* c) {
asm volatile(
"ldr s0, [%3] \n" // load rgbconstants
"ldr s1, [%3, #48] \n" // load 4 bytes at offset 48; low 16 bits are the bias
"dup v6.16b, v0.b[0] \n" // broadcast weight for pixel byte 1
"dup v7.16b, v0.b[1] \n" // broadcast weight for pixel byte 2
"dup v16.16b, v0.b[2] \n" // broadcast weight for pixel byte 3
"dup v17.8h, v1.h[0] \n" // broadcast bias to eight 16-bit lanes
"1: \n"
// De-interleave: v1 = byte 0, v2 = byte 1, v3 = byte 2, v4 = byte 3.
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
// pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v6.8b \n" // B
"umull2 v1.8h, v2.16b, v6.16b \n" // upper 8 pixels; overwrites the unused byte-0 lanes
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes past the updated source pointer
"umlal v0.8h, v3.8b, v7.8b \n" // G
"umlal2 v1.8h, v3.16b, v7.16b \n"
"umlal v0.8h, v4.8b, v16.8b \n" // R
"umlal2 v1.8h, v4.16b, v16.16b \n"
// addhn adds the bias and keeps the high byte of each 16-bit lane.
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v17.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n" // repeat while the count left by subs is still positive
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
: "r"(c) // %3
// v5 is listed as clobbered although no instruction above references it.
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
"v17");
}
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) { void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants); ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
} }
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) { void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants); ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
} }
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) { void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants); ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
} }
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba, void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
// No need for a separate implementation for RGBA inputs, just permute the ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);
// RGB constants.
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width,
&kRgb24I601DotProdConstants);
} }
void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba, void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
uint8_t* dst_yj, uint8_t* dst_yj,
int width) { int width) {
// No need for a separate implementation for RGBA inputs, just permute the ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
// RGB constants.
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width,
&kRgb24JPEGDotProdConstants);
} }
void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra, void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
uint8_t* dst_y, uint8_t* dst_y,
int width) { int width) {
// No need for a separate implementation for RGBA inputs, just permute the ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);
// RGB constants.
ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width,
&kRawI601DotProdConstants);
} }
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb, void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
@ -3978,30 +3712,32 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
int width, int width,
const struct ArgbConstants* c) { const struct ArgbConstants* c) {
asm volatile( asm volatile(
"ldr d0, [%3] \n" // load rgbconstants "ldr s16, [%3] \n" // load 4 coeffs
"dup v5.16b, v0.b[0] \n" "ldr s17, [%3, #48] \n" // load kAddY[0]
"dup v6.16b, v0.b[1] \n" "dup v18.16b, v16.b[0] \n" // B
"dup v7.16b, v0.b[2] \n" "dup v19.16b, v16.b[1] \n" // G
"dup v16.8h, v0.h[2] \n" "dup v20.16b, v16.b[2] \n" // R
"dup v21.8h, v17.h[0] \n" // bias
"1: \n" "1: \n"
"ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels. "ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
"subs %w2, %w2, #16 \n" // 16 processed per loop. "subs %w2, %w2, #16 \n" // 16 processed per loop.
"umull v0.8h, v2.8b, v5.8b \n" // B "umull v0.8h, v2.8b, v18.8b \n" // B
"umull2 v1.8h, v2.16b, v5.16b \n" "umull2 v1.8h, v2.16b, v18.16b \n"
"prfm pldl1keep, [%0, 448] \n" "prfm pldl1keep, [%0, 448] \n"
"umlal v0.8h, v3.8b, v6.8b \n" // G "umlal v0.8h, v3.8b, v19.8b \n" // G
"umlal2 v1.8h, v3.16b, v6.16b \n" "umlal2 v1.8h, v3.16b, v19.16b \n"
"umlal v0.8h, v4.8b, v7.8b \n" // R "umlal v0.8h, v4.8b, v20.8b \n" // R
"umlal2 v1.8h, v4.16b, v7.16b \n" "umlal2 v1.8h, v4.16b, v20.16b \n"
"addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y "addhn v0.8b, v0.8h, v21.8h \n" // 16 bit to 8 bit Y
"addhn v1.8b, v1.8h, v16.8h \n" "addhn v1.8b, v1.8h, v21.8h \n"
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y. "st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
"b.gt 1b \n" "b.gt 1b \n"
: "+r"(src_rgb), // %0 : "+r"(src_rgb), // %0
"+r"(dst_y), // %1 "+r"(dst_y), // %1
"+r"(width) // %2 "+r"(width) // %2
: "r"(c) // %3 : "r"(c) // %3
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"); : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
"v19", "v20", "v21");
} }