mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-06-15 00:16:08 +08:00
ABGRToJ420 call ARGBToI420Matrix
- Standardize libyuv ARGB-family (ARGB, ABGR, RGBA, BGRA) to YUV conversion by utilizing the generic MatrixRow architecture and explicit ArgbConstants. - Consolidated ARGBToI420, ABGRToI420, BGRAToI420, and RGBAToI420 as wrappers for ARGBToI420Matrix. - Refactored ABGRToJ420, ABGRToJ422, and ABGRToI422 to use generic matrix functions. - Added matrix-based versions for NV21, I400, YUY2, and UYVY. - Updated RAW and RGB24 to I420/I422/I444 dispatchers to use MatrixRow logic and explicit constants. - Fixed parameter swap bugs in ARGBToI422, ARGBToJ422, and ABGRToJ422. - Fixed a bug in the generic C implementation of matrix row functions ensuring all 4 channels are processed correctly for all ARGB-family formats. - Moved kShuffleAARRGGBB in row_gcc.cc to the top of the libyuv namespace for visibility. - Cleaned up redundant format-specific row implementations. Bug: libyuv:42280902 Change-Id: I67ffa4c476abc0d2dcc4650510d7bda91b65988e Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/7830291 Reviewed-by: richard winterton <rrwinterton@gmail.com> Commit-Queue: Frank Barchard <fbarchard@google.com>
This commit is contained in:
parent
4aacbbdfb4
commit
4b4e68b372
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1936
|
||||
Version: 1937
|
||||
Revision: DEPS
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
|
||||
@ -875,6 +875,19 @@ int BGRAToI420(const uint8_t* src_bgra,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// BGRA little endian (argb in memory) to I422.
|
||||
LIBYUV_API
|
||||
int BGRAToI422(const uint8_t* src_bgra,
|
||||
int src_stride_bgra,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// ABGR little endian (rgba in memory) to I420.
|
||||
LIBYUV_API
|
||||
int ABGRToI420(const uint8_t* src_abgr,
|
||||
@ -888,6 +901,19 @@ int ABGRToI420(const uint8_t* src_abgr,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// ABGR little endian (rgba in memory) to I422.
|
||||
LIBYUV_API
|
||||
int ABGRToI422(const uint8_t* src_abgr,
|
||||
int src_stride_abgr,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// RGBA little endian (abgr in memory) to I420.
|
||||
LIBYUV_API
|
||||
int RGBAToI420(const uint8_t* src_rgba,
|
||||
@ -901,6 +927,19 @@ int RGBAToI420(const uint8_t* src_rgba,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// RGBA little endian (abgr in memory) to I422.
|
||||
LIBYUV_API
|
||||
int RGBAToI422(const uint8_t* src_rgba,
|
||||
int src_stride_rgba,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// RGB little endian (bgr in memory) to I420.
|
||||
LIBYUV_API
|
||||
int RGB24ToI420(const uint8_t* src_rgb24,
|
||||
|
||||
@ -245,6 +245,19 @@ int ARGBToI422(const uint8_t* src_argb,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Convert ABGR To I422.
|
||||
LIBYUV_API
|
||||
int ABGRToI422(const uint8_t* src_abgr,
|
||||
int src_stride_abgr,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
uint8_t* dst_u,
|
||||
int dst_stride_u,
|
||||
uint8_t* dst_v,
|
||||
int dst_stride_v,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// ARGB to I422 with matrix. See ArgbConstants at the top of this file for usage.
|
||||
LIBYUV_API
|
||||
int ARGBToI422Matrix(const uint8_t* src_argb,
|
||||
@ -458,7 +471,7 @@ int ARGBToUYVY(const uint8_t* src_argb,
|
||||
|
||||
// RAW to NV21 with Matrix
|
||||
LIBYUV_API
|
||||
int RGBToNV21Matrix(const uint8_t* src_raw,
|
||||
int RAWToNV21Matrix(const uint8_t* src_raw,
|
||||
int src_stride_raw,
|
||||
uint8_t* dst_y,
|
||||
int dst_stride_y,
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1936
|
||||
#define LIBYUV_VERSION 1937
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
3315
source/convert.cc
3315
source/convert.cc
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -8,6 +8,7 @@
|
||||
* be found in the AUTHORS file in the root of the source tree.
|
||||
*/
|
||||
|
||||
#include "libyuv/convert_from_argb.h" // For ArgbConstants
|
||||
#include "libyuv/planar_functions.h"
|
||||
|
||||
#include <assert.h>
|
||||
@ -15,12 +16,10 @@
|
||||
|
||||
#include "libyuv/cpu_id.h"
|
||||
#include "libyuv/row.h"
|
||||
#include "libyuv/convert_from_argb.h"
|
||||
#include "libyuv/scale_row.h" // for ScaleRowDown2
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
@ -4745,8 +4744,8 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
||||
uint8_t* dst,
|
||||
int width)) {
|
||||
int y;
|
||||
void (*ARGBToYMatrixRow)(const uint8_t* src_argb, uint8_t* dst_y, int width,
|
||||
const struct ArgbConstants* c) = ARGBToYMatrixRow_C;
|
||||
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_g, int width) =
|
||||
ARGBToYJRow_C;
|
||||
void (*SobelYRow)(const uint8_t* src_y0, const uint8_t* src_y1,
|
||||
uint8_t* dst_sobely, int width) = SobelYRow_C;
|
||||
void (*SobelXRow)(const uint8_t* src_y0, const uint8_t* src_y1,
|
||||
@ -4763,65 +4762,57 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
||||
src_stride_argb = -src_stride_argb;
|
||||
}
|
||||
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOYJROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_SSSE3;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_SSSE3;
|
||||
ARGBToYJRow = ARGBToYJRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_AVX2)
|
||||
#if defined(HAS_ARGBTOYJROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX2;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX2;
|
||||
ARGBToYJRow = ARGBToYJRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_AVX512BW)
|
||||
#if defined(HAS_ARGBTOYROW_AVX512BW)
|
||||
if (TestCpuFlag(kCpuHasAVX512BW)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_AVX512BW;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_AVX512BW;
|
||||
if (IS_ALIGNED(width, 64)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_AVX512BW;
|
||||
ARGBToYJRow = ARGBToYJRow_AVX512BW;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON)
|
||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON;
|
||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_NEON_DOTPROD)
|
||||
if (TestCpuFlag(kCpuHasNeonDotProd)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_NEON_DotProd;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_NEON_DotProd;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_LSX)
|
||||
#if defined(HAS_ARGBTOYJROW_LSX)
|
||||
if (TestCpuFlag(kCpuHasLSX)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LSX;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_LSX;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_LSX;
|
||||
ARGBToYJRow = ARGBToYJRow_LSX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_LASX)
|
||||
#if defined(HAS_ARGBTOYJROW_LASX)
|
||||
if (TestCpuFlag(kCpuHasLASX)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_Any_LASX;
|
||||
ARGBToYJRow = ARGBToYJRow_Any_LASX;
|
||||
if (IS_ALIGNED(width, 32)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_LASX;
|
||||
ARGBToYJRow = ARGBToYJRow_LASX;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOYMATRIXROW_RVV)
|
||||
#if defined(HAS_ARGBTOYJROW_RVV)
|
||||
if (TestCpuFlag(kCpuHasRVV)) {
|
||||
ARGBToYMatrixRow = ARGBToYMatrixRow_RVV;
|
||||
ARGBToYJRow = ARGBToYJRow_RVV;
|
||||
}
|
||||
#endif
|
||||
|
||||
@ -4859,10 +4850,10 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
||||
uint8_t* row_y2 = row_y1 + row_size;
|
||||
if (!rows)
|
||||
return 1;
|
||||
ARGBToYMatrixRow(src_argb, row_y0, width, &kArgbJPEGConstants);
|
||||
ARGBToYJRow(src_argb, row_y0, width);
|
||||
row_y0[-1] = row_y0[0];
|
||||
memset(row_y0 + width, row_y0[width - 1], 16); // Extrude 16 for valgrind.
|
||||
ARGBToYMatrixRow(src_argb, row_y1, width, &kArgbJPEGConstants);
|
||||
ARGBToYJRow(src_argb, row_y1, width);
|
||||
row_y1[-1] = row_y1[0];
|
||||
memset(row_y1 + width, row_y1[width - 1], 16);
|
||||
memset(row_y2 + width, 0, 16);
|
||||
@ -4872,7 +4863,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
||||
if (y < (height - 1)) {
|
||||
src_argb += src_stride_argb;
|
||||
}
|
||||
ARGBToYMatrixRow(src_argb, row_y2, width, &kArgbJPEGConstants);
|
||||
ARGBToYJRow(src_argb, row_y2, width);
|
||||
row_y2[-1] = row_y2[0];
|
||||
row_y2[width] = row_y2[width - 1];
|
||||
|
||||
|
||||
@ -753,28 +753,31 @@ MAKEROWYJ(ABGR, 0, 1, 2, 4)
|
||||
MAKEROWYJ(RGBA, 3, 2, 1, 4)
|
||||
#undef MAKEROWYJ
|
||||
|
||||
static __inline uint8_t RGBToYMatrix(uint8_t r,
|
||||
uint8_t g,
|
||||
uint8_t b,
|
||||
static __inline uint8_t RGBToYMatrix(uint8_t b0,
|
||||
uint8_t b1,
|
||||
uint8_t b2,
|
||||
uint8_t b3,
|
||||
const struct ArgbConstants* c) {
|
||||
return (c->kRGBToY[2] * r + c->kRGBToY[1] * g + c->kRGBToY[0] * b +
|
||||
c->kAddY[0]) >>
|
||||
return (c->kRGBToY[0] * b0 + c->kRGBToY[1] * b1 + c->kRGBToY[2] * b2 +
|
||||
c->kRGBToY[3] * b3 + c->kAddY[0]) >>
|
||||
8;
|
||||
}
|
||||
static __inline uint8_t RGBToUMatrix(uint8_t r,
|
||||
uint8_t g,
|
||||
uint8_t b,
|
||||
static __inline uint8_t RGBToUMatrix(uint8_t b0,
|
||||
uint8_t b1,
|
||||
uint8_t b2,
|
||||
uint8_t b3,
|
||||
const struct ArgbConstants* c) {
|
||||
return (c->kAddUV[0] -
|
||||
(c->kRGBToU[2] * r + c->kRGBToU[1] * g + c->kRGBToU[0] * b)) >>
|
||||
return (c->kAddUV[0] - (c->kRGBToU[0] * b0 + c->kRGBToU[1] * b1 +
|
||||
c->kRGBToU[2] * b2 + c->kRGBToU[3] * b3)) >>
|
||||
8;
|
||||
}
|
||||
static __inline uint8_t RGBToVMatrix(uint8_t r,
|
||||
uint8_t g,
|
||||
uint8_t b,
|
||||
static __inline uint8_t RGBToVMatrix(uint8_t b0,
|
||||
uint8_t b1,
|
||||
uint8_t b2,
|
||||
uint8_t b3,
|
||||
const struct ArgbConstants* c) {
|
||||
return (c->kAddUV[0] -
|
||||
(c->kRGBToV[2] * r + c->kRGBToV[1] * g + c->kRGBToV[0] * b)) >>
|
||||
return (c->kAddUV[0] - (c->kRGBToV[0] * b0 + c->kRGBToV[1] * b1 +
|
||||
c->kRGBToV[2] * b2 + c->kRGBToV[3] * b3)) >>
|
||||
8;
|
||||
}
|
||||
|
||||
@ -784,7 +787,7 @@ void ARGBToYMatrixRow_C(const uint8_t* src_argb,
|
||||
const struct ArgbConstants* c) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
dst_y[0] = RGBToYMatrix(src_argb[2], src_argb[1], src_argb[0], c);
|
||||
dst_y[0] = RGBToYMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
|
||||
src_argb += 4;
|
||||
dst_y += 1;
|
||||
}
|
||||
@ -799,25 +802,28 @@ void ARGBToUVMatrixRow_C(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1 = src_argb + src_stride_argb;
|
||||
int x;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
uint8_t ab =
|
||||
uint8_t b0 =
|
||||
(src_argb[0] + src_argb[4] + src_argb1[0] + src_argb1[4] + 2) >> 2;
|
||||
uint8_t ag =
|
||||
uint8_t b1 =
|
||||
(src_argb[1] + src_argb[5] + src_argb1[1] + src_argb1[5] + 2) >> 2;
|
||||
uint8_t ar =
|
||||
uint8_t b2 =
|
||||
(src_argb[2] + src_argb[6] + src_argb1[2] + src_argb1[6] + 2) >> 2;
|
||||
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
|
||||
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
|
||||
uint8_t b3 =
|
||||
(src_argb[3] + src_argb[7] + src_argb1[3] + src_argb1[7] + 2) >> 2;
|
||||
dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
|
||||
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
|
||||
src_argb += 8;
|
||||
src_argb1 += 8;
|
||||
dst_u += 1;
|
||||
dst_v += 1;
|
||||
}
|
||||
if (width & 1) {
|
||||
uint8_t ab = (src_argb[0] + src_argb1[0] + 1) >> 1;
|
||||
uint8_t ag = (src_argb[1] + src_argb1[1] + 1) >> 1;
|
||||
uint8_t ar = (src_argb[2] + src_argb1[2] + 1) >> 1;
|
||||
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
|
||||
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
|
||||
uint8_t b0 = (src_argb[0] + src_argb1[0] + 1) >> 1;
|
||||
uint8_t b1 = (src_argb[1] + src_argb1[1] + 1) >> 1;
|
||||
uint8_t b2 = (src_argb[2] + src_argb1[2] + 1) >> 1;
|
||||
uint8_t b3 = (src_argb[3] + src_argb1[3] + 1) >> 1;
|
||||
dst_u[0] = RGBToUMatrix(b0, b1, b2, b3, c);
|
||||
dst_v[0] = RGBToVMatrix(b0, b1, b2, b3, c);
|
||||
}
|
||||
}
|
||||
|
||||
@ -828,11 +834,10 @@ void ARGBToUV444MatrixRow_C(const uint8_t* src_argb,
|
||||
const struct ArgbConstants* c) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
uint8_t ab = src_argb[0];
|
||||
uint8_t ag = src_argb[1];
|
||||
uint8_t ar = src_argb[2];
|
||||
dst_u[0] = RGBToUMatrix(ar, ag, ab, c);
|
||||
dst_v[0] = RGBToVMatrix(ar, ag, ab, c);
|
||||
dst_u[0] =
|
||||
RGBToUMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
|
||||
dst_v[0] =
|
||||
RGBToVMatrix(src_argb[0], src_argb[1], src_argb[2], src_argb[3], c);
|
||||
src_argb += 4;
|
||||
dst_u += 1;
|
||||
dst_v += 1;
|
||||
@ -1513,16 +1518,16 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
|
||||
YUVCONSTANTSBODY(YG, YB, VR, VG, UG, UB);
|
||||
|
||||
#define MAKEARGBCONSTANTS(name, RY, GY, BY, RU, GU, BU, RV, GV, BV, AY, AUV) \
|
||||
const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \
|
||||
extern const struct ArgbConstants SIMD_ALIGNED(kArgb##name##Constants) = \
|
||||
ARGBCONSTANTSBODY(BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), -(GV), \
|
||||
-(RV), 0, AY, AUV); \
|
||||
const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \
|
||||
extern const struct ArgbConstants SIMD_ALIGNED(kAbgr##name##Constants) = \
|
||||
ARGBCONSTANTSBODY(RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), -(GV), \
|
||||
-(BV), 0, AY, AUV); \
|
||||
const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \
|
||||
extern const struct ArgbConstants SIMD_ALIGNED(kRgba##name##Constants) = \
|
||||
ARGBCONSTANTSBODY(0, BY, GY, RY, 0, -(BU), -(GU), -(RU), 0, -(BV), \
|
||||
-(GV), -(RV), AY, AUV); \
|
||||
const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \
|
||||
extern const struct ArgbConstants SIMD_ALIGNED(kBgra##name##Constants) = \
|
||||
ARGBCONSTANTSBODY(0, RY, GY, BY, 0, -(RU), -(GU), -(BU), 0, -(RV), \
|
||||
-(GV), -(BV), AY, AUV);
|
||||
|
||||
|
||||
@ -1848,32 +1848,41 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d16}, [%4] \n" // load kRGBToU
|
||||
"vld1.8 {d17}, [%5] \n" // load kRGBToV
|
||||
"vld1.16 {d18[0]}, [%6] \n" // load kAddUV[0]
|
||||
"vabs.s8 d16, d16 \n" // BU, GU, RU
|
||||
"vabs.s8 d17, d17 \n" // BV, GV, RV
|
||||
"vdup.8 d20, d16[0] \n" // BU
|
||||
"vdup.8 d21, d16[1] \n" // GU
|
||||
"vdup.8 d22, d16[2] \n" // RU
|
||||
"vdup.8 d23, d17[0] \n" // BV
|
||||
"vdup.8 d24, d17[1] \n" // GV
|
||||
"vdup.8 d25, d17[2] \n" // RV
|
||||
"vdup.16 q15, d18[0] \n" // kAddUV
|
||||
|
||||
"vld1.8 {d24}, [%4] \n" // load kRGBToU
|
||||
"vld1.8 {d25}, [%5] \n" // load kRGBToV
|
||||
"vld1.16 {d26[0]}, [%6] \n" // load kAddUV[0]
|
||||
"vmovl.s8 q10, d24 \n" // U coeffs (8 shorts)
|
||||
"vmovl.s8 q11, d25 \n" // V coeffs (8 shorts)
|
||||
"vdup.16 q6, d26[0] \n" // bias
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
||||
"subs %3, %3, #8 \n" // 8 processed per loop.
|
||||
"vmull.u8 q2, d0, d20 \n" // B * BU
|
||||
"vmlsl.u8 q2, d1, d21 \n" // - G * GU
|
||||
"vmlsl.u8 q2, d2, d22 \n" // - R * RU
|
||||
|
||||
"vmull.u8 q3, d2, d25 \n" // R * RV
|
||||
"vmlsl.u8 q3, d1, d24 \n" // - G * GV
|
||||
"vmlsl.u8 q3, d0, d23 \n" // - B * BV
|
||||
"vmovl.u8 q4, d0 \n" // B
|
||||
"vmovl.u8 q5, d1 \n" // G
|
||||
"vmovl.u8 q7, d2 \n" // R
|
||||
"vmovl.u8 q8, d3 \n" // A
|
||||
|
||||
"vaddhn.u16 d0, q2, q15 \n" // signed -> unsigned
|
||||
"vaddhn.u16 d1, q3, q15 \n"
|
||||
"vdup.16 q12, d20[0] \n"
|
||||
"vmul.s16 q2, q4, q12 \n" // U = B * U0
|
||||
"vdup.16 q12, d20[1] \n"
|
||||
"vmla.s16 q2, q5, q12 \n" // U += G * U1
|
||||
"vdup.16 q12, d20[2] \n"
|
||||
"vmla.s16 q2, q7, q12 \n" // U += R * U2
|
||||
"vdup.16 q12, d20[3] \n"
|
||||
"vmla.s16 q2, q8, q12 \n" // U += A * U3
|
||||
|
||||
"vdup.16 q12, d22[0] \n"
|
||||
"vmul.s16 q3, q4, q12 \n" // V = B * V0
|
||||
"vdup.16 q12, d22[1] \n"
|
||||
"vmla.s16 q3, q5, q12 \n" // V += G * V1
|
||||
"vdup.16 q12, d22[2] \n"
|
||||
"vmla.s16 q3, q7, q12 \n" // V += R * V2
|
||||
"vdup.16 q12, d22[3] \n"
|
||||
"vmla.s16 q3, q8, q12 \n" // V += A * V3
|
||||
|
||||
"vsubhn.s16 d0, q6, q2 \n" // 128.0 - U
|
||||
"vsubhn.s16 d1, q6, q3 \n" // 128.0 - V
|
||||
|
||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
|
||||
@ -1885,8 +1894,8 @@ void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||
: "r"(&c->kRGBToU), // %4
|
||||
"r"(&c->kRGBToV), // %5
|
||||
"r"(&c->kAddUV) // %6
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11",
|
||||
"q12", "q13", "q14", "q15");
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
|
||||
"q10", "q11", "q12");
|
||||
}
|
||||
|
||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
@ -1926,16 +1935,11 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
const struct ArgbConstants* c) {
|
||||
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
||||
asm volatile (
|
||||
"vld1.8 {d18}, [%5] \n" // load kRGBToU
|
||||
"vld1.8 {d19}, [%6] \n" // load kRGBToV
|
||||
"vmovl.s8 q8, d18 \n" // U coeffs in q8 (d16, d17)
|
||||
"vmovl.s8 q9, d19 \n" // V coeffs in q9 (d18, d19)
|
||||
"vdup.16 q10, d16[0] \n" // U0
|
||||
"vdup.16 q11, d16[1] \n" // U1
|
||||
"vdup.16 q12, d16[2] \n" // U2
|
||||
"vdup.16 q13, d18[0] \n" // V0
|
||||
"vdup.16 q14, d18[1] \n" // V1
|
||||
"vdup.16 q15, d18[2] \n" // V2
|
||||
"vld1.8 {d24}, [%5] \n" // load kRGBToU (8 bytes, only 4 used)
|
||||
"vld1.8 {d25}, [%6] \n" // load kRGBToV
|
||||
"vmovl.s8 q14, d24 \n" // U coeffs in d28
|
||||
"vmovl.s8 q15, d25 \n" // V coeffs in d30
|
||||
"vmov.u16 q11, #0x8000 \n" // 128.0 bias
|
||||
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
|
||||
@ -1944,28 +1948,39 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
|
||||
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n"
|
||||
"vpadal.u8 q0, q4 \n" // B
|
||||
"vpadal.u8 q1, q5 \n" // G
|
||||
"vpadal.u8 q2, q6 \n" // R
|
||||
"vpadal.u8 q3, q7 \n" // A
|
||||
|
||||
"vrshr.u16 q0, q0, #2 \n" // average of 4
|
||||
"vrshr.u16 q1, q1, #2 \n"
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
"vrshr.u16 q3, q3, #2 \n"
|
||||
|
||||
"vmov.u16 q3, #0x8000 \n" // 128.0
|
||||
|
||||
"vmul.s16 q8, q0, q10 \n" // U = B * U0
|
||||
"vmla.s16 q8, q1, q11 \n" // U += G * U1
|
||||
"vdup.16 q12, d28[0] \n"
|
||||
"vmul.s16 q8, q0, q12 \n" // U = B * U0
|
||||
"vdup.16 q12, d28[1] \n"
|
||||
"vmla.s16 q8, q1, q12 \n" // U += G * U1
|
||||
"vdup.16 q12, d28[2] \n"
|
||||
"vmla.s16 q8, q2, q12 \n" // U += R * U2
|
||||
"vdup.16 q12, d28[3] \n"
|
||||
"vmla.s16 q8, q3, q12 \n" // U += A * U3
|
||||
|
||||
"vmul.s16 q9, q0, q13 \n" // V = B * V0
|
||||
"vmla.s16 q9, q1, q14 \n" // V += G * V1
|
||||
"vmla.s16 q9, q2, q15 \n" // V += R * V2
|
||||
"vdup.16 q12, d30[0] \n"
|
||||
"vmul.s16 q9, q0, q12 \n" // V = B * V0
|
||||
"vdup.16 q12, d30[1] \n"
|
||||
"vmla.s16 q9, q1, q12 \n" // V += G * V1
|
||||
"vdup.16 q12, d30[2] \n"
|
||||
"vmla.s16 q9, q2, q12 \n" // V += R * V2
|
||||
"vdup.16 q12, d30[3] \n"
|
||||
"vmla.s16 q9, q3, q12 \n" // V += A * V3
|
||||
|
||||
"vsubhn.s16 d0, q3, q8 \n" // 128.0 - U
|
||||
"vsubhn.s16 d1, q3, q9 \n" // 128.0 - V
|
||||
"vsubhn.s16 d0, q11, q8 \n" // 128.0 - U
|
||||
"vsubhn.s16 d1, q11, q9 \n" // 128.0 - V
|
||||
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
@ -1978,7 +1993,7 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
: "r"(&c->kRGBToU), // %5
|
||||
"r"(&c->kRGBToV) // %6
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
"q8", "q9", "q11", "q12", "q14", "q15"
|
||||
);
|
||||
}
|
||||
|
||||
@ -2212,44 +2227,8 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"add %1, %0, %1 \n" // src_stride + src_bgra
|
||||
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
|
||||
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
|
||||
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
|
||||
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8000 \n" // 128.0
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
|
||||
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"vrshr.u16 q1, q1, #2 \n" // average of 4
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
"vrshr.u16 q3, q3, #2 \n"
|
||||
|
||||
RGBTOUV(q3, q2, q1)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(src_stride_bgra), // %1
|
||||
"+r"(dst_u), // %2-
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
&kBgraI601Constants);
|
||||
}
|
||||
|
||||
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
@ -2257,44 +2236,8 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"add %1, %0, %1 \n" // src_stride + src_abgr
|
||||
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
|
||||
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
|
||||
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
|
||||
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8000 \n" // 128.0
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
|
||||
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"vrshr.u16 q0, q0, #2 \n" // average of 4
|
||||
"vrshr.u16 q1, q1, #2 \n"
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
|
||||
RGBTOUV(q2, q1, q0)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(src_stride_abgr), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
&kAbgrI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
@ -2302,44 +2245,8 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
asm volatile (
|
||||
"add %1, %0, %1 \n" // src_stride + src_rgba
|
||||
"vmov.s16 q10, #112 \n" // UB/VR 0.875 coefficient
|
||||
"vmov.s16 q11, #74 \n" // UG -0.5781 coefficient
|
||||
"vmov.s16 q12, #38 \n" // UR -0.2969 coefficient
|
||||
"vmov.s16 q13, #18 \n" // VB -0.1406 coefficient
|
||||
"vmov.s16 q14, #94 \n" // VG -0.7344 coefficient
|
||||
"vmov.u16 q15, #0x8000 \n" // 128.0
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
|
||||
"subs %4, %4, #16 \n" // 16 processed per loop.
|
||||
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
|
||||
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
|
||||
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
|
||||
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
|
||||
"vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"vrshr.u16 q0, q0, #2 \n" // average of 4
|
||||
"vrshr.u16 q1, q1, #2 \n"
|
||||
"vrshr.u16 q2, q2, #2 \n"
|
||||
|
||||
RGBTOUV(q0, q1, q2)
|
||||
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
|
||||
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(src_stride_rgba), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
|
||||
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
|
||||
);
|
||||
ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
&kRgbaI601Constants);
|
||||
}
|
||||
|
||||
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
@ -2801,15 +2708,16 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"vld1.8 {d24}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d24[0] \n" // B
|
||||
"vdup.8 d21, d24[1] \n" // G
|
||||
"vdup.8 d22, d24[2] \n" // R
|
||||
"vdup.8 d23, d24[3] \n" // A
|
||||
"vdup.16 q12, d25[0] \n" // bias
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 pixels
|
||||
"subs %1, %1, #16 \n" // 16 processed per loop.
|
||||
"vmull.u8 q8, d0, d20 \n" // B
|
||||
"vmull.u8 q9, d1, d20 \n"
|
||||
@ -2817,6 +2725,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"vmlal.u8 q9, d3, d21 \n"
|
||||
"vmlal.u8 q8, d4, d22 \n" // R
|
||||
"vmlal.u8 q9, d5, d22 \n"
|
||||
"vmlal.u8 q8, d6, d23 \n" // A
|
||||
"vmlal.u8 q9, d7, d23 \n"
|
||||
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||
"vaddhn.u16 d1, q9, q12 \n"
|
||||
"vst1.8 {d0, d1}, [%2]! \n" // store 16 pixels Y.
|
||||
@ -2826,8 +2736,8 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"+r"(dst_y) // %2
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
|
||||
"d24", "d25");
|
||||
}
|
||||
|
||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
@ -2846,52 +2756,20 @@ void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
|
||||
// Same code as ARGB, except the LD4
|
||||
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||
"vmull.u8 q8, d2, d20 \n" // B
|
||||
"vmull.u8 q9, d3, d20 \n"
|
||||
"vmlal.u8 q8, d4, d21 \n" // G
|
||||
"vmlal.u8 q9, d5, d21 \n"
|
||||
"vmlal.u8 q8, d6, d22 \n" // R
|
||||
"vmlal.u8 q9, d7, d22 \n"
|
||||
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||
"vaddhn.u16 d1, q9, q12 \n"
|
||||
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
}
|
||||
|
||||
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kArgbI601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kArgbJPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
|
||||
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kAbgrI601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
|
||||
}
|
||||
|
||||
void BGRAToYJRow_NEON(const uint8_t* src_bgra, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_bgra, dst_yj, width, &kBgraJPEGConstants);
|
||||
}
|
||||
|
||||
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
@ -2899,12 +2777,12 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"vld1.8 {d16}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d18[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d16[0] \n" // BY
|
||||
"vdup.8 d21, d16[1] \n" // GY
|
||||
"vdup.8 d22, d16[2] \n" // RY
|
||||
"vdup.16 q12, d18[0] \n" // AY
|
||||
"vld1.8 {d24}, [%3] \n" // load kRGBToY
|
||||
"vld1.16 {d25[0]}, [%4] \n" // load kAddY[0]
|
||||
"vdup.8 d20, d24[0] \n" // BY
|
||||
"vdup.8 d21, d24[1] \n" // GY
|
||||
"vdup.8 d22, d24[2] \n" // RY
|
||||
"vdup.16 q12, d25[0] \n" // AY
|
||||
"1: \n"
|
||||
"vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
|
||||
// RGB24.
|
||||
@ -2925,8 +2803,8 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
"+r"(width) // %2
|
||||
: "r"(&c->kRGBToY), // %3
|
||||
"r"(&c->kAddY) // %4
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||
"q12");
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12",
|
||||
"d24", "d25");
|
||||
}
|
||||
|
||||
|
||||
|
||||
@ -2736,47 +2736,61 @@ struct RgbUVConstants {
|
||||
};
|
||||
|
||||
// 8x1 pixels.
|
||||
static void ARGBToUV444MatrixRow_NEON(
|
||||
const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
void ARGBToUV444MatrixRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ldr d0, [%4] \n" // load rgbuvconstants
|
||||
"dup v24.16b, v0.b[0] \n" // UB 0.875 coefficient
|
||||
"dup v25.16b, v0.b[1] \n" // UG -0.5781 coefficient
|
||||
"dup v26.16b, v0.b[2] \n" // UR -0.2969 coefficient
|
||||
"dup v27.16b, v0.b[4] \n" // VB -0.1406 coefficient
|
||||
"dup v28.16b, v0.b[5] \n" // VG -0.7344 coefficient
|
||||
"neg v24.16b, v24.16b \n"
|
||||
"movi v29.8h, #0x80, lsl #8 \n" // 128.0
|
||||
|
||||
"ldr q16, [%[c], #16] \n" // kRGBToU
|
||||
"ldr q17, [%[c], #32] \n" // kRGBToV
|
||||
"ldr s0, [%[c], #64] \n" // kAddUV
|
||||
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
|
||||
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
|
||||
"dup v20.8h, v16.h[0] \n" // U0
|
||||
"dup v21.8h, v16.h[1] \n" // U1
|
||||
"dup v22.8h, v16.h[2] \n" // U2
|
||||
"dup v23.8h, v16.h[3] \n" // U3
|
||||
"dup v24.8h, v17.h[0] \n" // V0
|
||||
"dup v26.8h, v17.h[1] \n" // V1
|
||||
"dup v27.8h, v17.h[2] \n" // V2
|
||||
"dup v28.8h, v17.h[3] \n" // V3
|
||||
"dup v25.8h, v0.h[0] \n" // kAddUV
|
||||
"1: \n"
|
||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||
"umlsl v4.8h, v1.8b, v25.8b \n" // G
|
||||
"umlsl v4.8h, v2.8b, v26.8b \n" // R
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
|
||||
"umull v3.8h, v2.8b, v24.8b \n" // R
|
||||
"umlsl v3.8h, v1.8b, v28.8b \n" // G
|
||||
"umlsl v3.8h, v0.8b, v27.8b \n" // B
|
||||
"uxtl v4.8h, v0.8b \n"
|
||||
"uxtl v5.8h, v1.8b \n"
|
||||
"uxtl v6.8h, v2.8b \n"
|
||||
"uxtl v7.8h, v3.8b \n"
|
||||
|
||||
"addhn v0.8b, v4.8h, v29.8h \n" // signed -> unsigned
|
||||
"addhn v1.8b, v3.8h, v29.8h \n"
|
||||
// U = B*U0 + G*U1 + R*U2 + A*U3
|
||||
"mul v18.8h, v4.8h, v20.8h \n"
|
||||
"mla v18.8h, v5.8h, v21.8h \n"
|
||||
"mla v18.8h, v6.8h, v22.8h \n"
|
||||
"mla v18.8h, v7.8h, v23.8h \n"
|
||||
|
||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
|
||||
// V = B*V0 + G*V1 + R*V2 + A*V3
|
||||
"mul v19.8h, v4.8h, v24.8h \n"
|
||||
"mla v19.8h, v5.8h, v26.8h \n"
|
||||
"mla v19.8h, v6.8h, v27.8h \n"
|
||||
"mla v19.8h, v7.8h, v28.8h \n"
|
||||
|
||||
"subhn v0.8b, v25.8h, v18.8h \n"
|
||||
"subhn v1.8b, v25.8h, v19.8h \n"
|
||||
|
||||
"st1 {v0.8b}, [%1], #8 \n"
|
||||
"st1 {v1.8b}, [%2], #8 \n"
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
: "r"(rgbuvconstants) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26",
|
||||
"v27", "v28", "v29");
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+r"(width) // %3
|
||||
: [c] "r"(c) // %4
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
|
||||
"v26", "v27", "v28");
|
||||
}
|
||||
|
||||
static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
@ -2784,10 +2798,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ld2r {v16.4s, v17.4s}, [%[rgbuvconstants]] \n"
|
||||
"movi v29.8h, #0x80, lsl #8 \n" // 128.0
|
||||
"ldr q16, [%[c], #16] \n" // kRGBToU
|
||||
"ldr q17, [%[c], #32] \n" // kRGBToV
|
||||
"ldr s0, [%[c], #64] \n" // kAddUV
|
||||
"dup v29.8h, v0.h[0] \n" // 128.0
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%[src]], #32 \n"
|
||||
"subs %w[width], %w[width], #8 \n" // 8 processed per loop.
|
||||
@ -2807,11 +2823,11 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
"str d0, [%[dst_u]], #8 \n" // store 8 pixels U.
|
||||
"str d1, [%[dst_v]], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: [src] "+r"(src_argb), // %[src]
|
||||
[dst_u] "+r"(dst_u), // %[dst_u]
|
||||
[dst_v] "+r"(dst_v), // %[dst_v]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [rgbuvconstants] "r"(rgbuvconstants) // %[rgbuvconstants]
|
||||
: [src] "+r"(src_argb), // %[src]
|
||||
[dst_u] "+r"(dst_u), // %[dst_u]
|
||||
[dst_v] "+r"(dst_v), // %[dst_v]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [c] "r"(c) // %[c]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17",
|
||||
"v29");
|
||||
}
|
||||
@ -2824,15 +2840,12 @@ static void ARGBToUV444MatrixRow_NEON_I8MM(
|
||||
// VG -0.7344 coefficient = -94
|
||||
// VR 0.875 coefficient = 112
|
||||
|
||||
static const struct RgbUVConstants kARGBI601UVConstants = {{-112, 74, 38, 0},
|
||||
{18, 94, -112, 0}};
|
||||
|
||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||
&kARGBI601UVConstants);
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
@ -2840,26 +2853,15 @@ void ARGBToUV444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
|
||||
&kARGBI601UVConstants);
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
|
||||
// RGB to JPEG coefficients
|
||||
// UB 0.500 coefficient = 128
|
||||
// UG -0.33126 coefficient = -85
|
||||
// UR -0.16874 coefficient = -43
|
||||
// VB -0.08131 coefficient = -21
|
||||
// VG -0.41869 coefficient = -107
|
||||
// VR 0.500 coefficient = 128
|
||||
|
||||
static const struct RgbUVConstants kARGBJPEGUVConstants = {{-128, 85, 43, 0},
|
||||
{21, 107, -128, 0}};
|
||||
|
||||
void ARGBToUVJ444Row_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON(src_argb, dst_u, dst_v, width,
|
||||
&kARGBJPEGUVConstants);
|
||||
&kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
@ -2867,7 +2869,7 @@ void ARGBToUVJ444Row_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUV444MatrixRow_NEON_I8MM(src_argb, dst_u, dst_v, width,
|
||||
&kARGBJPEGUVConstants);
|
||||
&kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
#define RGBTOUV_SETUP_REG \
|
||||
@ -2906,12 +2908,14 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"ldr q17, [%[c], #32] \n" // kRGBToV
|
||||
"sxtl v16.8h, v16.8b \n" // sign extend U coeffs to 16-bit
|
||||
"sxtl v17.8h, v17.8b \n" // sign extend V coeffs to 16-bit
|
||||
"dup v20.8h, v16.h[0] \n" // U0 (-BU)
|
||||
"dup v21.8h, v16.h[1] \n" // U1 (-GU)
|
||||
"dup v22.8h, v16.h[2] \n" // U2 (-RU)
|
||||
"dup v23.8h, v17.h[0] \n" // V0 (-BV)
|
||||
"dup v24.8h, v17.h[1] \n" // V1 (-GV)
|
||||
"dup v26.8h, v17.h[2] \n" // V2 (-RV)
|
||||
"dup v20.8h, v16.h[0] \n" // U0
|
||||
"dup v21.8h, v16.h[1] \n" // U1
|
||||
"dup v22.8h, v16.h[2] \n" // U2
|
||||
"dup v23.8h, v16.h[3] \n" // U3
|
||||
"dup v24.8h, v17.h[0] \n" // V0
|
||||
"dup v26.8h, v17.h[1] \n" // V1
|
||||
"dup v27.8h, v17.h[2] \n" // V2
|
||||
"dup v28.8h, v17.h[3] \n" // V3
|
||||
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 in 16-bit (0x8000)
|
||||
|
||||
"1: \n"
|
||||
@ -2921,26 +2925,31 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"uaddlp v18.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
|
||||
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
|
||||
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"uadalp v18.8h, v7.16b \n" // A 16 bytes -> 8 shorts.
|
||||
|
||||
"urshr v0.8h, v0.8h, #2 \n" // average of 4
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
"urshr v18.8h, v18.8h, #2 \n"
|
||||
|
||||
// U = B*U0 + G*U1 + R*U2
|
||||
// U = B*U0 + G*U1 + R*U2 + A*U3
|
||||
"mul v3.8h, v0.8h, v20.8h \n"
|
||||
"mla v3.8h, v1.8h, v21.8h \n"
|
||||
"mla v3.8h, v2.8h, v22.8h \n"
|
||||
"mla v3.8h, v18.8h, v23.8h \n"
|
||||
|
||||
// V = B*V0 + G*V1 + R*V2
|
||||
"mul v4.8h, v0.8h, v23.8h \n"
|
||||
"mla v4.8h, v1.8h, v24.8h \n"
|
||||
"mla v4.8h, v2.8h, v26.8h \n"
|
||||
// V = B*V0 + G*V1 + R*V2 + A*V3
|
||||
"mul v4.8h, v0.8h, v24.8h \n"
|
||||
"mla v4.8h, v1.8h, v26.8h \n"
|
||||
"mla v4.8h, v2.8h, v27.8h \n"
|
||||
"mla v4.8h, v18.8h, v28.8h \n"
|
||||
|
||||
// U = (128.0 - U) >> 8, V = (128.0 - V) >> 8
|
||||
"subhn v0.8b, v25.8h, v3.8h \n"
|
||||
@ -2956,7 +2965,8 @@ void ARGBToUVMatrixRow_NEON(const uint8_t* src_argb,
|
||||
"+r"(width) // %4
|
||||
: [c] "r"(c) // %5
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v16", "v17", "v20", "v21", "v22", "v23", "v24", "v25", "v26"
|
||||
"v16", "v17", "v18", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
|
||||
"v27", "v28"
|
||||
);
|
||||
}
|
||||
|
||||
@ -2974,44 +2984,35 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
|
||||
asm volatile (
|
||||
"movi v20.8h, #128 \n" // UB/VR coeff (0.500)
|
||||
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
|
||||
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
|
||||
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
|
||||
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
|
||||
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
|
||||
ARGBToUVMatrixRow_NEON(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
&kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
"urshr v0.8h, v0.8h, #2 \n" // average of 4
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
int src_stride_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
&kAbgrI601Constants);
|
||||
}
|
||||
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(src_argb_1), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
int src_stride_bgra,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
&kBgraI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
int src_stride_rgba,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
&kRgbaI601Constants);
|
||||
}
|
||||
|
||||
void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
@ -3019,44 +3020,8 @@ void ABGRToUVJRow_NEON(const uint8_t* src_abgr,
|
||||
uint8_t* dst_uj,
|
||||
uint8_t* dst_vj,
|
||||
int width) {
|
||||
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
|
||||
asm volatile (
|
||||
"movi v20.8h, #128 \n" // UB/VR coeff (0.500)
|
||||
"movi v21.8h, #85 \n" // UG coeff (-0.33126)
|
||||
"movi v22.8h, #43 \n" // UR coeff (-0.16874)
|
||||
"movi v23.8h, #21 \n" // VB coeff (-0.08131)
|
||||
"movi v24.8h, #107 \n" // VG coeff (-0.41869)
|
||||
"movi v25.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in 16-bit)
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
"uadalp v0.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v2.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
|
||||
|
||||
"urshr v0.8h, v0.8h, #2 \n" // average of 4
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v2.8h, v1.8h, v0.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(src_abgr_1), // %1
|
||||
"+r"(dst_uj), // %2
|
||||
"+r"(dst_vj), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
ARGBToUVMatrixRow_NEON(src_abgr, src_stride_abgr, dst_uj, dst_vj, width,
|
||||
&kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
|
||||
@ -3149,126 +3114,6 @@ void RAWToUVJRow_NEON(const uint8_t* src_raw,
|
||||
);
|
||||
}
|
||||
|
||||
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
|
||||
int src_stride_bgra,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
const uint8_t* src_bgra_1 = src_bgra + src_stride_bgra;
|
||||
asm volatile (
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v3.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v3.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v1.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more
|
||||
"uadalp v0.8h, v7.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v3.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v2.8h, v5.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"urshr v0.8h, v0.8h, #2 \n" // average of 4
|
||||
"urshr v1.8h, v3.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(src_bgra_1), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
}
|
||||
|
||||
void ABGRToUVRow_NEON(const uint8_t* src_abgr,
|
||||
int src_stride_abgr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
const uint8_t* src_abgr_1 = src_abgr + src_stride_abgr;
|
||||
asm volatile (
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v3.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v2.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v0.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
|
||||
"uadalp v3.8h, v6.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v2.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v1.8h, v4.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"urshr v0.8h, v3.8h, #2 \n" // average of 4
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v0.8h, v2.8h, v1.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(src_abgr_1), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
}
|
||||
|
||||
void RGBAToUVRow_NEON(const uint8_t* src_rgba,
|
||||
int src_stride_rgba,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
const uint8_t* src_rgba_1 = src_rgba + src_stride_rgba;
|
||||
asm volatile (
|
||||
RGBTOUV_SETUP_REG
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
|
||||
"subs %w4, %w4, #16 \n" // 16 processed per loop.
|
||||
"uaddlp v0.8h, v1.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uaddlp v1.8h, v2.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uaddlp v2.8h, v3.16b \n" // R 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load 16 more.
|
||||
"uadalp v0.8h, v5.16b \n" // B 16 bytes -> 8 shorts.
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"uadalp v1.8h, v6.16b \n" // G 16 bytes -> 8 shorts.
|
||||
"uadalp v2.8h, v7.16b \n" // R 16 bytes -> 8 shorts.
|
||||
|
||||
"urshr v0.8h, v0.8h, #2 \n" // average of 4
|
||||
"urshr v1.8h, v1.8h, #2 \n"
|
||||
"urshr v2.8h, v2.8h, #2 \n"
|
||||
|
||||
RGBTOUV(v0.8h, v1.8h, v2.8h)
|
||||
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
|
||||
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(src_rgba_1), // %1
|
||||
"+r"(dst_u), // %2
|
||||
"+r"(dst_v), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
|
||||
"v20", "v21", "v22", "v23", "v24", "v25"
|
||||
);
|
||||
}
|
||||
|
||||
void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
|
||||
int src_stride_rgb24,
|
||||
uint8_t* dst_u,
|
||||
@ -3483,18 +3328,19 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
|
||||
);
|
||||
}
|
||||
|
||||
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the uvconstants layout.
|
||||
// Process any of ARGB, ABGR, BGRA, RGBA, by adjusting the ArgbConstants layout.
|
||||
static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
|
||||
int src_stride,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const int8_t* uvconstants) {
|
||||
const struct ArgbConstants* c) {
|
||||
const uint8_t* src1 = src + src_stride;
|
||||
asm volatile(
|
||||
"movi v23.8h, #0x80, lsl #8 \n" // 128.0 (0x8000 in
|
||||
// 16-bit)
|
||||
"ld2r {v24.4s, v25.4s}, [%[uvconstants]] \n"
|
||||
"ldr q24, [%[c], #16] \n" // kRGBToU
|
||||
"ldr q25, [%[c], #32] \n" // kRGBToV
|
||||
|
||||
"1: \n"
|
||||
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n" // load 8 pixels
|
||||
@ -3547,51 +3393,19 @@ static void ARGBToUVMatrixRow_NEON_I8MM_Impl(const uint8_t* src,
|
||||
[dst_u] "+r"(dst_u), // %[dst_u]
|
||||
[dst_v] "+r"(dst_v), // %[dst_v]
|
||||
[width] "+r"(width) // %[width]
|
||||
: [uvconstants] "r"(uvconstants) // %[uvconstants]
|
||||
: [c] "r"(c) // %[c]
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v23",
|
||||
"v24", "v25");
|
||||
}
|
||||
|
||||
// RGB to BT601 coefficients
|
||||
// UB 0.875 coefficient = 112
|
||||
// UG -0.5781 coefficient = -74
|
||||
// UR -0.2969 coefficient = -38
|
||||
// VB -0.1406 coefficient = -18
|
||||
// VG -0.7344 coefficient = -94
|
||||
// VR 0.875 coefficient = 112
|
||||
// I8MM constants are stored negated such that we can store 128 in int8_t.
|
||||
|
||||
static const int8_t kARGBToUVCoefficients[] = {
|
||||
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
|
||||
-112, 74, 38, 0, 18, 94, -112, 0,
|
||||
};
|
||||
|
||||
static const int8_t kABGRToUVCoefficients[] = {
|
||||
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
|
||||
38, 74, -112, 0, -112, 94, 18, 0,
|
||||
};
|
||||
|
||||
static const int8_t kBGRAToUVCoefficients[] = {
|
||||
// 0, -UR, -UG, -UB, 0, -VR, -VG, -VB
|
||||
0, 38, 74, -112, 0, -112, 94, 18,
|
||||
};
|
||||
|
||||
static const int8_t kRGBAToUVCoefficients[] = {
|
||||
// 0, -UB, -UG, -UR, 0, -VB, -VG, -VR
|
||||
0, -112, 74, 38, 0, 18, 94, -112,
|
||||
};
|
||||
|
||||
void ARGBToUVMatrixRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
int8_t uvconstants[8] = {
|
||||
(int8_t)c->kRGBToU[0], (int8_t)c->kRGBToU[1], (int8_t)c->kRGBToU[2], (int8_t)c->kRGBToU[3],
|
||||
(int8_t)c->kRGBToV[0], (int8_t)c->kRGBToV[1], (int8_t)c->kRGBToV[2], (int8_t)c->kRGBToV[3]};
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
uvconstants);
|
||||
c);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
@ -3600,7 +3414,7 @@ void ARGBToUVRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVCoefficients);
|
||||
&kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
@ -3609,7 +3423,7 @@ void ABGRToUVRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVCoefficients);
|
||||
&kAbgrI601Constants);
|
||||
}
|
||||
|
||||
void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
|
||||
@ -3618,7 +3432,7 @@ void BGRAToUVRow_NEON_I8MM(const uint8_t* src_bgra,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_bgra, src_stride_bgra, dst_u, dst_v, width,
|
||||
kBGRAToUVCoefficients);
|
||||
&kBgraI601Constants);
|
||||
}
|
||||
|
||||
void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
|
||||
@ -3627,35 +3441,16 @@ void RGBAToUVRow_NEON_I8MM(const uint8_t* src_rgba,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_rgba, src_stride_rgba, dst_u, dst_v, width,
|
||||
kRGBAToUVCoefficients);
|
||||
&kRgbaI601Constants);
|
||||
}
|
||||
|
||||
// RGB to JPEG coefficients
|
||||
// UB 0.500 coefficient = 128
|
||||
// UG -0.33126 coefficient = -85
|
||||
// UR -0.16874 coefficient = -43
|
||||
// VB -0.08131 coefficient = -21
|
||||
// VG -0.41869 coefficient = -107
|
||||
// VR 0.500 coefficient = 128
|
||||
// I8MM constants are stored negated such that we can store 128 in int8_t.
|
||||
|
||||
static const int8_t kARGBToUVJCoefficients[] = {
|
||||
// -UB, -UG, -UR, 0, -VB, -VG, -VR, 0
|
||||
-128, 85, 43, 0, 21, 107, -128, 0,
|
||||
};
|
||||
|
||||
static const int8_t kABGRToUVJCoefficients[] = {
|
||||
// -UR, -UG, -UB, 0, -VR, -VG, -VB, 0
|
||||
43, 85, -128, 0, -128, 107, 21, 0,
|
||||
};
|
||||
|
||||
void ARGBToUVJRow_NEON_I8MM(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_argb, src_stride_argb, dst_u, dst_v, width,
|
||||
kARGBToUVJCoefficients);
|
||||
&kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
@ -3664,7 +3459,7 @@ void ABGRToUVJRow_NEON_I8MM(const uint8_t* src_abgr,
|
||||
uint8_t* dst_v,
|
||||
int width) {
|
||||
ARGBToUVMatrixRow_NEON_I8MM_Impl(src_abgr, src_stride_abgr, dst_u, dst_v, width,
|
||||
kABGRToUVJCoefficients);
|
||||
&kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
|
||||
@ -3771,206 +3566,145 @@ void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ldr s0, [%3] \n" // load rgbconstants
|
||||
"ldr s1, [%3, #48] \n"
|
||||
"dup v6.16b, v0.b[0] \n"
|
||||
"dup v7.16b, v0.b[1] \n"
|
||||
"dup v16.16b, v0.b[2] \n"
|
||||
"dup v17.8h, v1.h[0] \n"
|
||||
"ldr s16, [%3] \n" // load 4 coeffs
|
||||
"ldr s17, [%3, #48] \n" // load kAddY[0]
|
||||
"dup v18.16b, v16.b[0] \n" // B
|
||||
"dup v19.16b, v16.b[1] \n" // G
|
||||
"dup v20.16b, v16.b[2] \n" // R
|
||||
"dup v21.16b, v16.b[3] \n" // A
|
||||
"dup v22.8h, v17.h[0] \n" // bias
|
||||
"1: \n"
|
||||
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
|
||||
// pixels.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"umull v0.8h, v2.8b, v6.8b \n" // B
|
||||
"umull2 v1.8h, v2.16b, v6.16b \n"
|
||||
"umull v0.8h, v2.8b, v18.8b \n" // B
|
||||
"umull2 v1.8h, v2.16b, v18.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"umlal v0.8h, v3.8b, v7.8b \n" // G
|
||||
"umlal2 v1.8h, v3.16b, v7.16b \n"
|
||||
"umlal v0.8h, v4.8b, v16.8b \n" // R
|
||||
"umlal2 v1.8h, v4.16b, v16.16b \n"
|
||||
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
|
||||
"addhn v1.8b, v1.8h, v17.8h \n"
|
||||
"umlal v0.8h, v3.8b, v19.8b \n" // G
|
||||
"umlal2 v1.8h, v3.16b, v19.16b \n"
|
||||
"umlal v0.8h, v4.8b, v20.8b \n" // R
|
||||
"umlal2 v1.8h, v4.16b, v20.16b \n"
|
||||
"umlal v0.8h, v5.8b, v21.8b \n" // A
|
||||
"umlal2 v1.8h, v5.16b, v21.16b \n"
|
||||
"addhn v0.8b, v0.8h, v22.8h \n" // 16 bit to 8 bit Y
|
||||
"addhn v1.8b, v1.8h, v22.8h \n"
|
||||
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17");
|
||||
: "r"(c) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18",
|
||||
"v19", "v20", "v21", "v22");
|
||||
}
|
||||
|
||||
|
||||
void ARGBToYMatrixRow_NEON_DotProd(
|
||||
const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ldr s0, [%3] \n" // load rgbconstants
|
||||
"ldr s1, [%3, #48] \n"
|
||||
"dup v16.4s, v0.s[0] \n"
|
||||
"dup v17.8h, v1.h[0] \n"
|
||||
"ldr s16, [%3] \n" // load 4 coeffs
|
||||
"ldr s17, [%3, #48] \n" // load kAddY[0]
|
||||
"dup v18.4s, v16.s[0] \n"
|
||||
"dup v19.8h, v17.h[0] \n"
|
||||
"1: \n"
|
||||
"ld1 {v4.16b, v5.16b, v6.16b, v7.16b}, [%0], #64 \n" // load 16
|
||||
// pixels.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"movi v0.16b, #0 \n"
|
||||
"movi v1.16b, #0 \n"
|
||||
"movi v2.16b, #0 \n"
|
||||
"movi v3.16b, #0 \n"
|
||||
"udot v0.4s, v4.16b, v16.16b \n"
|
||||
"udot v1.4s, v5.16b, v16.16b \n"
|
||||
"udot v2.4s, v6.16b, v16.16b \n"
|
||||
"udot v3.4s, v7.16b, v16.16b \n"
|
||||
"udot v0.4s, v4.16b, v18.16b \n"
|
||||
"udot v1.4s, v5.16b, v18.16b \n"
|
||||
"udot v2.4s, v6.16b, v18.16b \n"
|
||||
"udot v3.4s, v7.16b, v18.16b \n"
|
||||
"uzp1 v0.8h, v0.8h, v1.8h \n"
|
||||
"uzp1 v1.8h, v2.8h, v3.8h \n"
|
||||
"addhn v0.8b, v0.8h, v17.8h \n"
|
||||
"addhn v1.8b, v1.8h, v17.8h \n"
|
||||
"addhn v0.8b, v0.8h, v19.8h \n"
|
||||
"addhn v1.8b, v1.8h, v19.8h \n"
|
||||
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17");
|
||||
: "r"(c) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19");
|
||||
}
|
||||
|
||||
|
||||
// RGB to JPeg coefficients
|
||||
// B * 0.1140 coefficient = 29
|
||||
// G * 0.5870 coefficient = 150
|
||||
// R * 0.2990 coefficient = 77
|
||||
// Add 0.5
|
||||
static const struct ArgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, {}, {}, {0x0080}, {}};
|
||||
static const struct ArgbConstants kRgb24JPEGDotProdConstants = {{0, 29, 150, 77}, {}, {}, {0x0080}, {}};
|
||||
|
||||
static const struct ArgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, {}, {}, {0x0080}, {}};
|
||||
|
||||
// RGB to BT.601 coefficients
|
||||
// B * 0.1016 coefficient = 25
|
||||
// G * 0.5078 coefficient = 129
|
||||
// R * 0.2578 coefficient = 66
|
||||
// Add 16.5 = 0x1080
|
||||
|
||||
static const struct ArgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, {}, {}, {0x1080}, {}};
|
||||
static const struct ArgbConstants kRgb24I601DotProdConstants = {{0, 25, 129, 66}, {}, {}, {0x1080}, {}};
|
||||
|
||||
static const struct ArgbConstants kRawI601Constants = {{66, 129, 25, 0}, {}, {}, {0x1080}, {}};
|
||||
static const struct ArgbConstants kRawI601DotProdConstants = {{0, 66, 129, 25}, {}, {}, {0x1080}, {}};
|
||||
|
||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
// Converts one row of ABGR pixels to YJ (JPEG constants) by delegating to
// ARGBToYMatrixRow_NEON.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRawJPEGConstants) is the removed line
// and the second (&kAbgrJPEGConstants) is its replacement - confirm against
// the real source.
void ABGRToYJRow_NEON(const uint8_t* src_abgr, uint8_t* dst_yj, int width) {
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kRawJPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of ARGB pixels to Y by delegating to
// ARGBToYMatrixRow_NEON_DotProd with I601 constants.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRgb24I601Constants) is the removed line
// and the second (&kArgbI601Constants) is its replacement - confirm against
// the real source.
void ARGBToYRow_NEON_DotProd(const uint8_t* src_argb,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_y, width, &kArgbI601Constants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of ARGB pixels to YJ (JPEG constants) by
// delegating to ARGBToYMatrixRow_NEON_DotProd.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRgb24JPEGConstants) is the removed line
// and the second (&kArgbJPEGConstants) is its replacement - confirm against
// the real source.
void ARGBToYJRow_NEON_DotProd(const uint8_t* src_argb,
|
||||
uint8_t* dst_yj,
|
||||
int width) {
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kRgb24JPEGConstants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_argb, dst_yj, width, &kArgbJPEGConstants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of ABGR pixels to Y by delegating to
// ARGBToYMatrixRow_NEON_DotProd with I601 constants.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRawI601Constants) is the removed line
// and the second (&kAbgrI601Constants) is its replacement - confirm against
// the real source.
void ABGRToYRow_NEON_DotProd(const uint8_t* src_abgr,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kRawI601Constants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_y, width, &kAbgrI601Constants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of ABGR pixels to YJ (JPEG constants) by
// delegating to ARGBToYMatrixRow_NEON_DotProd.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRawJPEGConstants) is the removed line
// and the second (&kAbgrJPEGConstants) is its replacement - confirm against
// the real source.
void ABGRToYJRow_NEON_DotProd(const uint8_t* src_abgr,
|
||||
uint8_t* dst_yj,
|
||||
int width) {
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kRawJPEGConstants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_abgr, dst_yj, width, &kAbgrJPEGConstants);
|
||||
}
|
||||
|
||||
// Computes one row of Y from 4-byte pixels whose first byte is ignored (A for
// RGBA) and whose next three bytes are weighted by c's first three
// coefficient bytes: Y = (b1 * k0 + b2 * k1 + b3 * k2 + bias) >> 8.
|
||||
// Same code as ARGB, except which LD4 registers feed the multiplies.
// Each iteration consumes 64 source bytes and stores 16 Y bytes, and the loop
// body runs before width is tested, so at least 16 pixels are always
// processed. NOTE(review): presumably callers pass a width that is a positive
// multiple of 16 and handle remainders elsewhere - confirm against callers.
|
||||
static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ldr s0, [%3] \n" // load 4 coefficient bytes from the start of *c
|
||||
"ldr s1, [%3, #48] \n" // load the additive term at byte offset 48 of *c
|
||||
"dup v6.16b, v0.b[0] \n" // broadcast coefficient 0
|
||||
"dup v7.16b, v0.b[1] \n" // broadcast coefficient 1
|
||||
"dup v16.16b, v0.b[2] \n" // broadcast coefficient 2
|
||||
"dup v17.8h, v1.h[0] \n" // broadcast 16-bit additive term
|
||||
"1: \n"
|
||||
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
|
||||
// pixels; v1 (first byte of each pixel) is not used as an input.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"umull v0.8h, v2.8b, v6.8b \n" // B (low 8 pixels)
|
||||
"umull2 v1.8h, v2.16b, v6.16b \n" // B (high 8 pixels); reuses v1
|
||||
"prfm pldl1keep, [%0, 448] \n" // prefetch ahead in the source row
|
||||
"umlal v0.8h, v3.8b, v7.8b \n" // G
|
||||
"umlal2 v1.8h, v3.16b, v7.16b \n"
|
||||
"umlal v0.8h, v4.8b, v16.8b \n" // R
|
||||
"umlal2 v1.8h, v4.16b, v16.16b \n"
|
||||
"addhn v0.8b, v0.8h, v17.8h \n" // add term, keep high byte: 16 bit to 8 bit Y
|
||||
"addhn v1.8b, v1.8h, v17.8h \n"
|
||||
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||
"b.gt 1b \n" // loop while the remaining width is positive
|
||||
: "+r"(src_rgba), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c) // %3
|
||||
// v5 is listed as clobbered although the template above never names it.
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||
"v17");
|
||||
}
|
||||
|
||||
// Converts one row of RGBA pixels to Y using I601 constants.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (RGBAToYMatrixRow_NEON with
// &kRgb24I601Constants) is the removed line and the second
// (ARGBToYMatrixRow_NEON with &kRgbaI601Constants) is its replacement -
// confirm against the real source.
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgbaI601Constants);
|
||||
}
|
||||
|
||||
// Converts one row of RGBA pixels to YJ using JPEG constants.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (RGBAToYMatrixRow_NEON with
// &kRgb24JPEGConstants) is the removed line and the second
// (ARGBToYMatrixRow_NEON with &kRgbaJPEGConstants) is its replacement -
// confirm against the real source.
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
|
||||
ARGBToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
|
||||
// Converts one row of BGRA pixels to Y using I601 constants.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (RGBAToYMatrixRow_NEON with
// &kRawI601Constants) is the removed line and the second
// (ARGBToYMatrixRow_NEON with &kBgraI601Constants) is its replacement -
// confirm against the real source.
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
|
||||
ARGBToYMatrixRow_NEON(src_bgra, dst_y, width, &kBgraI601Constants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of RGBA pixels to Y using I601 constants
// via ARGBToYMatrixRow_NEON_DotProd.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRgb24I601DotProdConstants) is the
// removed line and the second (&kRgbaI601Constants) is its replacement -
// confirm against the real source.
void RGBAToYRow_NEON_DotProd(const uint8_t* src_rgba,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
// No need for a separate implementation for RGBA inputs: the ARGB routine is
|
||||
// reused with the coefficient bytes permuted to match the RGBA byte order.
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width,
|
||||
&kRgb24I601DotProdConstants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_y, width, &kRgbaI601Constants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of RGBA pixels to YJ using JPEG constants
// via ARGBToYMatrixRow_NEON_DotProd.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRgb24JPEGDotProdConstants) is the
// removed line and the second (&kRgbaJPEGConstants) is its replacement -
// confirm against the real source.
void RGBAToYJRow_NEON_DotProd(const uint8_t* src_rgba,
|
||||
uint8_t* dst_yj,
|
||||
int width) {
|
||||
// No need for a separate implementation for RGBA inputs: the ARGB routine is
|
||||
// reused with the coefficient bytes permuted to match the RGBA byte order.
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width,
|
||||
&kRgb24JPEGDotProdConstants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_rgba, dst_yj, width, &kRgbaJPEGConstants);
|
||||
}
|
||||
|
||||
// DotProd variant: converts one row of BGRA pixels to Y using I601 constants
// via ARGBToYMatrixRow_NEON_DotProd.
// NOTE(review): this text is a diff rendered without +/- markers, so two calls
// are listed. Presumably the first (&kRawI601DotProdConstants) is the removed
// line and the second (&kBgraI601Constants) is its replacement - confirm
// against the real source.
void BGRAToYRow_NEON_DotProd(const uint8_t* src_bgra,
|
||||
uint8_t* dst_y,
|
||||
int width) {
|
||||
// No need for a separate implementation for BGRA inputs: the ARGB routine is
|
||||
// reused with the coefficient bytes permuted to match the BGRA byte order.
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width,
|
||||
&kRawI601DotProdConstants);
|
||||
ARGBToYMatrixRow_NEON_DotProd(src_bgra, dst_y, width, &kBgraI601Constants);
|
||||
}
|
||||
|
||||
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
@ -3978,30 +3712,32 @@ void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||
int width,
|
||||
const struct ArgbConstants* c) {
|
||||
asm volatile(
|
||||
"ldr d0, [%3] \n" // load rgbconstants
|
||||
"dup v5.16b, v0.b[0] \n"
|
||||
"dup v6.16b, v0.b[1] \n"
|
||||
"dup v7.16b, v0.b[2] \n"
|
||||
"dup v16.8h, v0.h[2] \n"
|
||||
"ldr s16, [%3] \n" // load 4 coeffs
|
||||
"ldr s17, [%3, #48] \n" // load kAddY[0]
|
||||
"dup v18.16b, v16.b[0] \n" // B
|
||||
"dup v19.16b, v16.b[1] \n" // G
|
||||
"dup v20.16b, v16.b[2] \n" // R
|
||||
"dup v21.8h, v17.h[0] \n" // bias
|
||||
"1: \n"
|
||||
"ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
|
||||
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||
"umull v0.8h, v2.8b, v5.8b \n" // B
|
||||
"umull2 v1.8h, v2.16b, v5.16b \n"
|
||||
"umull v0.8h, v2.8b, v18.8b \n" // B
|
||||
"umull2 v1.8h, v2.16b, v18.16b \n"
|
||||
"prfm pldl1keep, [%0, 448] \n"
|
||||
"umlal v0.8h, v3.8b, v6.8b \n" // G
|
||||
"umlal2 v1.8h, v3.16b, v6.16b \n"
|
||||
"umlal v0.8h, v4.8b, v7.8b \n" // R
|
||||
"umlal2 v1.8h, v4.16b, v7.16b \n"
|
||||
"addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
|
||||
"addhn v1.8b, v1.8h, v16.8h \n"
|
||||
"umlal v0.8h, v3.8b, v19.8b \n" // G
|
||||
"umlal2 v1.8h, v3.16b, v19.16b \n"
|
||||
"umlal v0.8h, v4.8b, v20.8b \n" // R
|
||||
"umlal2 v1.8h, v4.16b, v20.16b \n"
|
||||
"addhn v0.8b, v0.8h, v21.8h \n" // 16 bit to 8 bit Y
|
||||
"addhn v1.8b, v1.8h, v21.8h \n"
|
||||
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_rgb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(c) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v16", "v17", "v18",
|
||||
"v19", "v20", "v21");
|
||||
}
|
||||
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user