mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
RAWToJ400 faster version for ARM
- Unrolled to 16 pixels - Take constants via structure, allowing different colorspace and channel order - Use ADDHN to add 16.5 and take upper 8 bits of 16 bit values, narrowing to 8 bits - clang-format applied, affecting mips code On Cortex A510 Was RAWToJ400_Opt (1623 ms) Now RAWToJ400_Opt (862 ms) C RAWToJ400_Opt (1627 ms) Bug: b/220171611 Change-Id: I06a9baf9650ebe2802fb6ff6dfbd524e2c06ada0 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3534023 Reviewed-by: Wan-Teh Chang <wtc@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
3aebf69d66
commit
95b14b2446
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1813
|
Version: 1814
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -2117,7 +2117,7 @@ int P210ToARGBMatrixFilter(const uint16_t* src_y,
|
|||||||
const struct YuvConstants* yuvconstants,
|
const struct YuvConstants* yuvconstants,
|
||||||
int width,
|
int width,
|
||||||
int height,
|
int height,
|
||||||
enum FilterMode filter) ;
|
enum FilterMode filter);
|
||||||
|
|
||||||
// Convert P010 to AR30 with matrix and UV filter mode.
|
// Convert P010 to AR30 with matrix and UV filter mode.
|
||||||
LIBYUV_API
|
LIBYUV_API
|
||||||
|
|||||||
@ -91,7 +91,8 @@
|
|||||||
* out : 23,40,41,26, 23,40,41,26
|
* out : 23,40,41,26, 23,40,41,26
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
|
static inline __m128i __lsx_vdp2add_h_b(__m128i in_c,
|
||||||
|
__m128i in_h,
|
||||||
__m128i in_l) {
|
__m128i in_l) {
|
||||||
__m128i out;
|
__m128i out;
|
||||||
|
|
||||||
@ -117,7 +118,8 @@ static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
|
|||||||
* out : 23,40,41,26, 23,40,41,26
|
* out : 23,40,41,26, 23,40,41,26
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
|
static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c,
|
||||||
|
__m128i in_h,
|
||||||
__m128i in_l) {
|
__m128i in_l) {
|
||||||
__m128i out;
|
__m128i out;
|
||||||
|
|
||||||
@ -143,7 +145,8 @@ static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
|
|||||||
* out : -4,-24,-60,-112, 6,26,62,114
|
* out : -4,-24,-60,-112, 6,26,62,114
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
|
static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c,
|
||||||
|
__m128i in_h,
|
||||||
__m128i in_l) {
|
__m128i in_l) {
|
||||||
__m128i out;
|
__m128i out;
|
||||||
|
|
||||||
@ -169,7 +172,8 @@ static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
|
|||||||
* out : 23,40,41,26
|
* out : 23,40,41,26
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
|
static inline __m128i __lsx_vdp2add_w_h(__m128i in_c,
|
||||||
|
__m128i in_h,
|
||||||
__m128i in_l) {
|
__m128i in_l) {
|
||||||
__m128i out;
|
__m128i out;
|
||||||
|
|
||||||
@ -414,8 +418,8 @@ static inline __m128i __lsx_vclip255_w(__m128i _in) {
|
|||||||
_out0, _out1, _out2, _out3, _out4, _out5, _out6, \
|
_out0, _out1, _out2, _out3, _out4, _out5, _out6, \
|
||||||
_out7) \
|
_out7) \
|
||||||
{ \
|
{ \
|
||||||
__m128i zero = { 0 }; \
|
__m128i zero = {0}; \
|
||||||
__m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 }; \
|
__m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110}; \
|
||||||
__m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
|
__m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
|
||||||
\
|
\
|
||||||
_t0 = __lsx_vilvl_b(_in2, _in0); \
|
_t0 = __lsx_vilvl_b(_in2, _in0); \
|
||||||
@ -828,7 +832,8 @@ static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
|
|||||||
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -851,7 +856,8 @@ static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -874,7 +880,8 @@ static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -901,7 +908,8 @@ static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
|
|||||||
* out : 23,40,41,26, 23,40,41,26
|
* out : 23,40,41,26, 23,40,41,26
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -924,7 +932,8 @@ static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -947,7 +956,8 @@ static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -971,7 +981,8 @@ static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -1000,7 +1011,8 @@ static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
|
|||||||
* out : -7,-3,0,0, 0,-1,0,-1
|
* out : -7,-3,0,0, 0,-1,0,-1
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i out;
|
__m256i out;
|
||||||
|
|
||||||
@ -1201,7 +1213,8 @@ static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
|
|||||||
* out : 201, 602,1203,2004, -995, -1794,-2793,-3992
|
* out : 201, 602,1203,2004, -995, -1794,-2793,-3992
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i tmp0, tmp1, out;
|
__m256i tmp0, tmp1, out;
|
||||||
|
|
||||||
@ -1225,7 +1238,8 @@ static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
|
|||||||
* Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
|
* Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
|
||||||
* =============================================================================
|
* =============================================================================
|
||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
|
static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c,
|
||||||
|
__m256i in_h,
|
||||||
__m256i in_l) {
|
__m256i in_l) {
|
||||||
__m256i tmp0, tmp1, out;
|
__m256i tmp0, tmp1, out;
|
||||||
|
|
||||||
@ -1303,7 +1317,7 @@ static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
|
|||||||
*/
|
*/
|
||||||
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
|
static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
|
||||||
__m256i tmp1, out;
|
__m256i tmp1, out;
|
||||||
__m256i zero = { 0 };
|
__m256i zero = {0};
|
||||||
|
|
||||||
tmp1 = __lasx_xvilvl_b(zero, in_l);
|
tmp1 = __lasx_xvilvl_b(zero, in_l);
|
||||||
out = __lasx_xvsadd_hu(in_h, tmp1);
|
out = __lasx_xvsadd_hu(in_h, tmp1);
|
||||||
@ -1925,8 +1939,10 @@ static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
|
|||||||
{ \
|
{ \
|
||||||
RTYPE _tmp0 = (RTYPE)in0; \
|
RTYPE _tmp0 = (RTYPE)in0; \
|
||||||
int _i = 0; \
|
int _i = 0; \
|
||||||
if (enter) printf("\nVP:"); \
|
if (enter) \
|
||||||
for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
|
printf("\nVP:"); \
|
||||||
|
for (_i = 0; _i < element_num; _i++) \
|
||||||
|
printf("%d,", _tmp0[_i]); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif /* LOONGSON_INTRINSICS_H */
|
#endif /* LOONGSON_INTRINSICS_H */
|
||||||
|
|||||||
@ -83,16 +83,15 @@
|
|||||||
#else // !(__mips_isa_rev >= 6)
|
#else // !(__mips_isa_rev >= 6)
|
||||||
#define LW(psrc) \
|
#define LW(psrc) \
|
||||||
({ \
|
({ \
|
||||||
uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
|
uint8_t* psrc_lw_m = (uint8_t*)(psrc); \
|
||||||
uint32_t val_lw_m; \
|
uint32_t val_lw_m; \
|
||||||
\
|
\
|
||||||
__asm__ volatile ( \
|
__asm__ volatile( \
|
||||||
"lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
|
"lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
|
||||||
"lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
|
"lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
|
||||||
\
|
\
|
||||||
: [val_lw_m] "=&r"(val_lw_m) \
|
: [val_lw_m] "=&r"(val_lw_m) \
|
||||||
: [psrc_lw_m] "r"(psrc_lw_m) \
|
: [psrc_lw_m] "r"(psrc_lw_m)); \
|
||||||
); \
|
|
||||||
\
|
\
|
||||||
val_lw_m; \
|
val_lw_m; \
|
||||||
})
|
})
|
||||||
@ -100,16 +99,15 @@
|
|||||||
#if (__mips == 64)
|
#if (__mips == 64)
|
||||||
#define LD(psrc) \
|
#define LD(psrc) \
|
||||||
({ \
|
({ \
|
||||||
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
|
uint8_t* psrc_ld_m = (uint8_t*)(psrc); \
|
||||||
uint64_t val_ld_m = 0; \
|
uint64_t val_ld_m = 0; \
|
||||||
\
|
\
|
||||||
__asm__ volatile ( \
|
__asm__ volatile( \
|
||||||
"ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
|
"ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
|
||||||
"ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
|
"ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
|
||||||
\
|
\
|
||||||
: [val_ld_m] "=&r" (val_ld_m) \
|
: [val_ld_m] "=&r"(val_ld_m) \
|
||||||
: [psrc_ld_m] "r" (psrc_ld_m) \
|
: [psrc_ld_m] "r"(psrc_ld_m)); \
|
||||||
); \
|
|
||||||
\
|
\
|
||||||
val_ld_m; \
|
val_ld_m; \
|
||||||
})
|
})
|
||||||
|
|||||||
@ -1380,7 +1380,9 @@ void BGRAToYRow_LSX(const uint8_t* src_bgra, uint8_t* dst_y, int width);
|
|||||||
void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
void ABGRToYRow_LSX(const uint8_t* src_abgr, uint8_t* dst_y, int width);
|
||||||
void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
|
void RGBAToYRow_LSX(const uint8_t* src_rgba, uint8_t* dst_y, int width);
|
||||||
void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
|
void ARGB1555ToYRow_LSX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
|
||||||
void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555, uint8_t* dst_y, int width);
|
void ARGB1555ToYRow_LASX(const uint8_t* src_argb1555,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width);
|
||||||
void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
|
void RGB565ToYRow_LSX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
|
||||||
void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
|
void RGB565ToYRow_LASX(const uint8_t* src_rgb565, uint8_t* dst_y, int width);
|
||||||
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
|
void RGB24ToYRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_y, int width);
|
||||||
@ -2748,7 +2750,9 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
|
|||||||
int width);
|
int width);
|
||||||
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
|
void RGB24ToARGBRow_MSA(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
|
||||||
void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
|
void RGB24ToARGBRow_LSX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
|
||||||
void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_argb, int width);
|
void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
|
||||||
|
uint8_t* dst_argb,
|
||||||
|
int width);
|
||||||
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
|
void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width);
|
||||||
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
|
void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width);
|
||||||
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
|
void RAWToARGBRow_MSA(const uint8_t* src_raw, uint8_t* dst_argb, int width);
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1813
|
#define LIBYUV_VERSION 1814
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -1422,7 +1422,7 @@ int ARGBToI420(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1658,7 +1658,7 @@ int ABGRToI420(const uint8_t* src_abgr,
|
|||||||
#if defined(HAS_ABGRTOYROW_NEON)
|
#if defined(HAS_ABGRTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ABGRToYRow = ABGRToYRow_Any_NEON;
|
ABGRToYRow = ABGRToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ABGRToYRow = ABGRToYRow_NEON;
|
ABGRToYRow = ABGRToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1754,7 +1754,7 @@ int RGBAToI420(const uint8_t* src_rgba,
|
|||||||
#if defined(HAS_RGBATOYROW_NEON)
|
#if defined(HAS_RGBATOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGBAToYRow = RGBAToYRow_Any_NEON;
|
RGBAToYRow = RGBAToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
RGBAToYRow = RGBAToYRow_NEON;
|
RGBAToYRow = RGBAToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1855,13 +1855,11 @@ int RGB24ToI420(const uint8_t* src_rgb24,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
|
RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
|
||||||
RGB24ToYRow = RGB24ToYRow_Any_NEON;
|
RGB24ToYRow = RGB24ToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
|
||||||
RGB24ToYRow = RGB24ToYRow_NEON;
|
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
RGB24ToYRow = RGB24ToYRow_NEON;
|
||||||
RGB24ToUVRow = RGB24ToUVRow_NEON;
|
RGB24ToUVRow = RGB24ToUVRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
|
#if defined(HAS_RGB24TOYROW_MSA) && defined(HAS_RGB24TOUVROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
@ -2031,13 +2029,11 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
|
RGB24ToUVJRow = RGB24ToUVJRow_Any_NEON;
|
||||||
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
|
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
|
||||||
RGB24ToYJRow = RGB24ToYJRow_NEON;
|
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
RGB24ToYJRow = RGB24ToYJRow_NEON;
|
||||||
RGB24ToUVJRow = RGB24ToUVJRow_NEON;
|
RGB24ToUVJRow = RGB24ToUVJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
|
#if defined(HAS_RGB24TOYJROW_MSA) && defined(HAS_RGB24TOUVJROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
@ -2095,7 +2091,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
|||||||
#endif
|
#endif
|
||||||
#endif // HAS_RGB24TOYJROW
|
#endif // HAS_RGB24TOYJROW
|
||||||
|
|
||||||
{
|
{
|
||||||
#if !defined(HAS_RGB24TOYJROW)
|
#if !defined(HAS_RGB24TOYJROW)
|
||||||
// Allocate 2 rows of ARGB.
|
// Allocate 2 rows of ARGB.
|
||||||
const int kRowSize = (width * 4 + 31) & ~31;
|
const int kRowSize = (width * 4 + 31) & ~31;
|
||||||
@ -2132,8 +2128,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
|
|||||||
#if !defined(HAS_RGB24TOYJROW)
|
#if !defined(HAS_RGB24TOYJROW)
|
||||||
free_aligned_buffer_64(row);
|
free_aligned_buffer_64(row);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
#undef HAS_RGB24TOYJROW
|
#undef HAS_RGB24TOYJROW
|
||||||
|
|
||||||
@ -2187,13 +2183,11 @@ int RAWToI420(const uint8_t* src_raw,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RAWToUVRow = RAWToUVRow_Any_NEON;
|
RAWToUVRow = RAWToUVRow_Any_NEON;
|
||||||
RAWToYRow = RAWToYRow_Any_NEON;
|
RAWToYRow = RAWToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
|
||||||
RAWToYRow = RAWToYRow_NEON;
|
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
RAWToYRow = RAWToYRow_NEON;
|
||||||
RAWToUVRow = RAWToUVRow_NEON;
|
RAWToUVRow = RAWToUVRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
|
#if defined(HAS_RAWTOYROW_MSA) && defined(HAS_RAWTOUVROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
@ -2363,13 +2357,11 @@ int RAWToJ420(const uint8_t* src_raw,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RAWToUVJRow = RAWToUVJRow_Any_NEON;
|
RAWToUVJRow = RAWToUVJRow_Any_NEON;
|
||||||
RAWToYJRow = RAWToYJRow_Any_NEON;
|
RAWToYJRow = RAWToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
|
||||||
RAWToYJRow = RAWToYJRow_NEON;
|
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
RAWToYJRow = RAWToYJRow_NEON;
|
||||||
RAWToUVJRow = RAWToUVJRow_NEON;
|
RAWToUVJRow = RAWToUVJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
|
#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
@ -2521,8 +2513,8 @@ int RGB565ToI420(const uint8_t* src_rgb565,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// MSA version does direct RGB565 to YUV.
|
// MSA version does direct RGB565 to YUV.
|
||||||
#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) \
|
#elif (defined(HAS_RGB565TOYROW_MSA) || defined(HAS_RGB565TOYROW_LSX) || \
|
||||||
|| defined(HAS_RGB565TOYROW_LASX))
|
defined(HAS_RGB565TOYROW_LASX))
|
||||||
#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
|
#if defined(HAS_RGB565TOYROW_MSA) && defined(HAS_RGB565TOUVROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
|
RGB565ToUVRow = RGB565ToUVRow_Any_MSA;
|
||||||
@ -2701,8 +2693,8 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
// MSA version does direct ARGB1555 to YUV.
|
// MSA version does direct ARGB1555 to YUV.
|
||||||
#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) \
|
#elif (defined(HAS_ARGB1555TOYROW_MSA) || defined(HAS_ARGB1555TOYROW_LSX) || \
|
||||||
|| defined(HAS_ARGB1555TOYROW_LASX))
|
defined(HAS_ARGB1555TOYROW_LASX))
|
||||||
#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
|
#if defined(HAS_ARGB1555TOYROW_MSA) && defined(HAS_ARGB1555TOUVROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
|
ARGB1555ToUVRow = ARGB1555ToUVRow_Any_MSA;
|
||||||
@ -3067,7 +3059,7 @@ int RGB24ToJ400(const uint8_t* src_rgb24,
|
|||||||
#if defined(HAS_RGB24TOYJROW_NEON)
|
#if defined(HAS_RGB24TOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
|
RGB24ToYJRow = RGB24ToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
RGB24ToYJRow = RGB24ToYJRow_NEON;
|
RGB24ToYJRow = RGB24ToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -103,7 +103,7 @@ int ARGBToI444(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -205,7 +205,7 @@ int ARGBToI422(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -283,7 +283,7 @@ int ARGBToNV12(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -477,7 +477,7 @@ int ARGBToNV21(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -638,7 +638,7 @@ int ABGRToNV12(const uint8_t* src_abgr,
|
|||||||
#if defined(HAS_ABGRTOYROW_NEON)
|
#if defined(HAS_ABGRTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ABGRToYRow = ABGRToYRow_Any_NEON;
|
ABGRToYRow = ABGRToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ABGRToYRow = ABGRToYRow_NEON;
|
ABGRToYRow = ABGRToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -790,7 +790,7 @@ int ABGRToNV21(const uint8_t* src_abgr,
|
|||||||
#if defined(HAS_ABGRTOYROW_NEON)
|
#if defined(HAS_ABGRTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ABGRToYRow = ABGRToYRow_Any_NEON;
|
ABGRToYRow = ABGRToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ABGRToYRow = ABGRToYRow_NEON;
|
ABGRToYRow = ABGRToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -947,7 +947,7 @@ int ARGBToYUY2(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1110,7 +1110,7 @@ int ARGBToUYVY(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1249,7 +1249,7 @@ int ARGBToI400(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYROW_NEON)
|
#if defined(HAS_ARGBTOYROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYRow = ARGBToYRow_Any_NEON;
|
ARGBToYRow = ARGBToYRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYRow = ARGBToYRow_NEON;
|
ARGBToYRow = ARGBToYRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -1906,7 +1906,7 @@ int ARGBToJ420(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2029,7 +2029,7 @@ int ARGBToJ422(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2248,7 +2248,7 @@ int ARGBToJ400(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2314,7 +2314,7 @@ int RGBAToJ400(const uint8_t* src_rgba,
|
|||||||
#if defined(HAS_RGBATOYJROW_NEON)
|
#if defined(HAS_RGBATOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RGBAToYJRow = RGBAToYJRow_Any_NEON;
|
RGBAToYJRow = RGBAToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
RGBAToYJRow = RGBAToYJRow_NEON;
|
RGBAToYJRow = RGBAToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -2387,13 +2387,11 @@ int RAWToJNV21(const uint8_t* src_raw,
|
|||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
RAWToUVJRow = RAWToUVJRow_Any_NEON;
|
RAWToUVJRow = RAWToUVJRow_Any_NEON;
|
||||||
RAWToYJRow = RAWToYJRow_Any_NEON;
|
RAWToYJRow = RAWToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
|
||||||
RAWToYJRow = RAWToYJRow_NEON;
|
|
||||||
if (IS_ALIGNED(width, 16)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
|
RAWToYJRow = RAWToYJRow_NEON;
|
||||||
RAWToUVJRow = RAWToUVJRow_NEON;
|
RAWToUVJRow = RAWToUVJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
#endif
|
#endif
|
||||||
#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
|
#if defined(HAS_RAWTOYJROW_MSA) && defined(HAS_RAWTOUVJROW_MSA)
|
||||||
if (TestCpuFlag(kCpuHasMSA)) {
|
if (TestCpuFlag(kCpuHasMSA)) {
|
||||||
|
|||||||
@ -4186,7 +4186,7 @@ static int ARGBSobelize(const uint8_t* src_argb,
|
|||||||
#if defined(HAS_ARGBTOYJROW_NEON)
|
#if defined(HAS_ARGBTOYJROW_NEON)
|
||||||
if (TestCpuFlag(kCpuHasNEON)) {
|
if (TestCpuFlag(kCpuHasNEON)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
ARGBToYJRow = ARGBToYJRow_Any_NEON;
|
||||||
if (IS_ALIGNED(width, 8)) {
|
if (IS_ALIGNED(width, 16)) {
|
||||||
ARGBToYJRow = ARGBToYJRow_NEON;
|
ARGBToYJRow = ARGBToYJRow_NEON;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -987,7 +987,7 @@ ANY11(ARGBToYJRow_Any_SSSE3, ARGBToYJRow_SSSE3, 0, 4, 1, 15)
|
|||||||
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
|
ANY11(RGBAToYJRow_Any_SSSE3, RGBAToYJRow_SSSE3, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGBTOYROW_NEON
|
#ifdef HAS_ARGBTOYROW_NEON
|
||||||
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 7)
|
ANY11(ARGBToYRow_Any_NEON, ARGBToYRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGBTOYROW_MSA
|
#ifdef HAS_ARGBTOYROW_MSA
|
||||||
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
|
ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
|
||||||
@ -996,10 +996,10 @@ ANY11(ARGBToYRow_Any_MSA, ARGBToYRow_MSA, 0, 4, 1, 15)
|
|||||||
ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
|
ANY11(ARGBToYRow_Any_LASX, ARGBToYRow_LASX, 0, 4, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGBTOYJROW_NEON
|
#ifdef HAS_ARGBTOYJROW_NEON
|
||||||
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 7)
|
ANY11(ARGBToYJRow_Any_NEON, ARGBToYJRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGBATOYJROW_NEON
|
#ifdef HAS_RGBATOYJROW_NEON
|
||||||
ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 7)
|
ANY11(RGBAToYJRow_Any_NEON, RGBAToYJRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ARGBTOYJROW_MSA
|
#ifdef HAS_ARGBTOYJROW_MSA
|
||||||
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
|
ANY11(ARGBToYJRow_Any_MSA, ARGBToYJRow_MSA, 0, 4, 1, 15)
|
||||||
@ -1011,7 +1011,7 @@ ANY11(ARGBToYJRow_Any_LSX, ARGBToYJRow_LSX, 0, 4, 1, 15)
|
|||||||
ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31)
|
ANY11(ARGBToYJRow_Any_LASX, ARGBToYJRow_LASX, 0, 4, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_BGRATOYROW_NEON
|
#ifdef HAS_BGRATOYROW_NEON
|
||||||
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 7)
|
ANY11(BGRAToYRow_Any_NEON, BGRAToYRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_BGRATOYROW_MSA
|
#ifdef HAS_BGRATOYROW_MSA
|
||||||
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
|
ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
|
||||||
@ -1020,7 +1020,7 @@ ANY11(BGRAToYRow_Any_MSA, BGRAToYRow_MSA, 0, 4, 1, 15)
|
|||||||
ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15)
|
ANY11(BGRAToYRow_Any_LSX, BGRAToYRow_LSX, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ABGRTOYROW_NEON
|
#ifdef HAS_ABGRTOYROW_NEON
|
||||||
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 7)
|
ANY11(ABGRToYRow_Any_NEON, ABGRToYRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_ABGRTOYROW_MSA
|
#ifdef HAS_ABGRTOYROW_MSA
|
||||||
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
|
ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
|
||||||
@ -1029,7 +1029,7 @@ ANY11(ABGRToYRow_Any_MSA, ABGRToYRow_MSA, 0, 4, 1, 7)
|
|||||||
ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15)
|
ANY11(ABGRToYRow_Any_LSX, ABGRToYRow_LSX, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGBATOYROW_NEON
|
#ifdef HAS_RGBATOYROW_NEON
|
||||||
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 7)
|
ANY11(RGBAToYRow_Any_NEON, RGBAToYRow_NEON, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGBATOYROW_MSA
|
#ifdef HAS_RGBATOYROW_MSA
|
||||||
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
|
ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
|
||||||
@ -1038,7 +1038,7 @@ ANY11(RGBAToYRow_Any_MSA, RGBAToYRow_MSA, 0, 4, 1, 15)
|
|||||||
ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
|
ANY11(RGBAToYRow_Any_LSX, RGBAToYRow_LSX, 0, 4, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24TOYROW_NEON
|
#ifdef HAS_RGB24TOYROW_NEON
|
||||||
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 7)
|
ANY11(RGB24ToYRow_Any_NEON, RGB24ToYRow_NEON, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24TOYJROW_AVX2
|
#ifdef HAS_RGB24TOYJROW_AVX2
|
||||||
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
|
ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
|
||||||
@ -1047,7 +1047,7 @@ ANY11(RGB24ToYJRow_Any_AVX2, RGB24ToYJRow_AVX2, 0, 3, 1, 31)
|
|||||||
ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
|
ANY11(RGB24ToYJRow_Any_SSSE3, RGB24ToYJRow_SSSE3, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24TOYJROW_NEON
|
#ifdef HAS_RGB24TOYJROW_NEON
|
||||||
ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 7)
|
ANY11(RGB24ToYJRow_Any_NEON, RGB24ToYJRow_NEON, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RGB24TOYROW_MSA
|
#ifdef HAS_RGB24TOYROW_MSA
|
||||||
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
|
ANY11(RGB24ToYRow_Any_MSA, RGB24ToYRow_MSA, 0, 3, 1, 15)
|
||||||
@ -1059,7 +1059,7 @@ ANY11(RGB24ToYRow_Any_LSX, RGB24ToYRow_LSX, 0, 3, 1, 15)
|
|||||||
ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
|
ANY11(RGB24ToYRow_Any_LASX, RGB24ToYRow_LASX, 0, 3, 1, 31)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RAWTOYROW_NEON
|
#ifdef HAS_RAWTOYROW_NEON
|
||||||
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 7)
|
ANY11(RAWToYRow_Any_NEON, RAWToYRow_NEON, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RAWTOYJROW_AVX2
|
#ifdef HAS_RAWTOYJROW_AVX2
|
||||||
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
|
ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
|
||||||
@ -1068,7 +1068,7 @@ ANY11(RAWToYJRow_Any_AVX2, RAWToYJRow_AVX2, 0, 3, 1, 31)
|
|||||||
ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
|
ANY11(RAWToYJRow_Any_SSSE3, RAWToYJRow_SSSE3, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RAWTOYJROW_NEON
|
#ifdef HAS_RAWTOYJROW_NEON
|
||||||
ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 7)
|
ANY11(RAWToYJRow_Any_NEON, RAWToYJRow_NEON, 0, 3, 1, 15)
|
||||||
#endif
|
#endif
|
||||||
#ifdef HAS_RAWTOYROW_MSA
|
#ifdef HAS_RAWTOYROW_MSA
|
||||||
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
|
ANY11(RAWToYRow_Any_MSA, RAWToYRow_MSA, 0, 3, 1, 15)
|
||||||
|
|||||||
@ -211,7 +211,6 @@ extern "C" {
|
|||||||
_reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \
|
_reg1 = __lasx_xvmsub_h(_reg1, const_18, _tmpb); \
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
|
void MirrorRow_LASX(const uint8_t* src, uint8_t* dst, int width) {
|
||||||
int x;
|
int x;
|
||||||
int len = width / 64;
|
int len = width / 64;
|
||||||
@ -596,8 +595,8 @@ void I422ToARGB1555Row_LASX(const uint8_t* src_y,
|
|||||||
__m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
|
__m256i vec_yb, vec_yg, vec_ub, vec_vr, vec_ug, vec_vg;
|
||||||
__m256i vec_ubvr, vec_ugvg;
|
__m256i vec_ubvr, vec_ugvg;
|
||||||
__m256i const_0x80 = __lasx_xvldi(0x80);
|
__m256i const_0x80 = __lasx_xvldi(0x80);
|
||||||
__m256i alpha = {0x8000800080008000, 0x8000800080008000,
|
__m256i alpha = {0x8000800080008000, 0x8000800080008000, 0x8000800080008000,
|
||||||
0x8000800080008000, 0x8000800080008000};
|
0x8000800080008000};
|
||||||
|
|
||||||
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
|
YUVTORGB_SETUP(yuvconstants, vec_ub, vec_vr, vec_ug, vec_vg, vec_yg, vec_yb);
|
||||||
vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
|
vec_ubvr = __lasx_xvilvl_h(vec_ub, vec_vr);
|
||||||
@ -1507,14 +1506,14 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
|
|||||||
__m256i dst0, dst1, dst2, dst3;
|
__m256i dst0, dst1, dst2, dst3;
|
||||||
__m256i reg0, reg1, reg2, reg3;
|
__m256i reg0, reg1, reg2, reg3;
|
||||||
__m256i alpha = __lasx_xvldi(0xFF);
|
__m256i alpha = __lasx_xvldi(0xFF);
|
||||||
__m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514,
|
__m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
|
||||||
0x131211100F0E0D0C, 0x1B1A191817161514};
|
0x1B1A191817161514};
|
||||||
__m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100,
|
__m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
|
||||||
0x1F1E1D1C1B1A1918, 0x0706050403020100};
|
0x0706050403020100};
|
||||||
__m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C,
|
__m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
|
||||||
0x0B0A090807060504, 0x131211100F0E0D0C};
|
0x131211100F0E0D0C};
|
||||||
__m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706,
|
__m256i shuf3 = {0x1005040310020100, 0x100B0A0910080706, 0x1005040310020100,
|
||||||
0x1005040310020100, 0x100B0A0910080706};
|
0x100B0A0910080706};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
reg0 = __lasx_xvld(src_rgb24, 0);
|
reg0 = __lasx_xvld(src_rgb24, 0);
|
||||||
@ -1523,7 +1522,8 @@ void RGB24ToARGBRow_LASX(const uint8_t* src_rgb24,
|
|||||||
src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
|
src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
|
||||||
src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
|
src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
|
||||||
src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
|
src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
|
||||||
|
tmp1);
|
||||||
tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
|
tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
|
||||||
DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
|
DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
|
||||||
tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
|
tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
|
||||||
@ -1545,14 +1545,14 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
|||||||
__m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
|
__m256i tmp0, tmp1, tmp2, reg0, reg1, reg2, reg3;
|
||||||
__m256i dst0, dst1, dst2, dst3;
|
__m256i dst0, dst1, dst2, dst3;
|
||||||
__m256i alpha = __lasx_xvldi(0xFF);
|
__m256i alpha = __lasx_xvldi(0xFF);
|
||||||
__m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514,
|
__m256i shuf0 = {0x131211100F0E0D0C, 0x1B1A191817161514, 0x131211100F0E0D0C,
|
||||||
0x131211100F0E0D0C, 0x1B1A191817161514};
|
0x1B1A191817161514};
|
||||||
__m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100,
|
__m256i shuf1 = {0x1F1E1D1C1B1A1918, 0x0706050403020100, 0x1F1E1D1C1B1A1918,
|
||||||
0x1F1E1D1C1B1A1918, 0x0706050403020100};
|
0x0706050403020100};
|
||||||
__m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C,
|
__m256i shuf2 = {0x0B0A090807060504, 0x131211100F0E0D0C, 0x0B0A090807060504,
|
||||||
0x0B0A090807060504, 0x131211100F0E0D0C};
|
0x131211100F0E0D0C};
|
||||||
__m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708,
|
__m256i shuf3 = {0x1003040510000102, 0x10090A0B10060708, 0x1003040510000102,
|
||||||
0x1003040510000102, 0x10090A0B10060708};
|
0x10090A0B10060708};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
reg0 = __lasx_xvld(src_raw, 0);
|
reg0 = __lasx_xvld(src_raw, 0);
|
||||||
@ -1561,7 +1561,8 @@ void RAWToARGBRow_LASX(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
|||||||
src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
|
src0 = __lasx_xvpermi_q(reg1, reg0, 0x30);
|
||||||
src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
|
src1 = __lasx_xvpermi_q(reg2, reg0, 0x21);
|
||||||
src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
|
src2 = __lasx_xvpermi_q(reg2, reg1, 0x30);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0, tmp1);
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuf0, src1, src2, shuf1, tmp0,
|
||||||
|
tmp1);
|
||||||
tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
|
tmp2 = __lasx_xvshuf_b(src1, src2, shuf2);
|
||||||
DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
|
DUP4_ARG3(__lasx_xvshuf_b, alpha, src0, shuf3, alpha, tmp0, shuf3, alpha,
|
||||||
tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
|
tmp1, shuf3, alpha, tmp2, shuf3, reg0, reg1, reg2, reg3);
|
||||||
@ -1646,8 +1647,8 @@ void ARGB1555ToUVRow_LASX(const uint8_t* src_argb1555,
|
|||||||
0x8080808080808080, 0x8080808080808080};
|
0x8080808080808080, 0x8080808080808080};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555,
|
DUP4_ARG2(__lasx_xvld, src_argb1555, 0, src_argb1555, 32, next_argb1555, 0,
|
||||||
0, next_argb1555, 32, src0, src1, src2, src3);
|
next_argb1555, 32, src0, src1, src2, src3);
|
||||||
DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
|
DUP2_ARG2(__lasx_xvpickev_b, src1, src0, src3, src2, tmp0, tmp2);
|
||||||
DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
|
DUP2_ARG2(__lasx_xvpickod_b, src1, src0, src3, src2, tmp1, tmp3);
|
||||||
tmpb = __lasx_xvandi_b(tmp0, 0x1F);
|
tmpb = __lasx_xvandi_b(tmp0, 0x1F);
|
||||||
@ -1821,14 +1822,14 @@ void RGB24ToYRow_LASX(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
|||||||
0x4219421942194219, 0x4219421942194219};
|
0x4219421942194219, 0x4219421942194219};
|
||||||
__m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
|
__m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
|
||||||
0x1080108010801080, 0x1080108010801080};
|
0x1080108010801080, 0x1080108010801080};
|
||||||
__m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C,
|
__m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200,
|
||||||
0x0B09080605030200, 0x17151412110F0E0C};
|
0x17151412110F0E0C};
|
||||||
__m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604,
|
__m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18,
|
||||||
0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
|
0x0F0D0C0A09070604};
|
||||||
__m256i shuff2 = {0x000A000700040001, 0x001600130010000D,
|
__m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001,
|
||||||
0x000A000700040001, 0x001600130010000D};
|
0x001600130010000D};
|
||||||
__m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005,
|
__m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019,
|
||||||
0x0002001F001C0019, 0x000E000B00080005};
|
0x000E000B00080005};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
reg0 = __lasx_xvld(src_rgb24, 0);
|
reg0 = __lasx_xvld(src_rgb24, 0);
|
||||||
@ -1887,8 +1888,8 @@ void RGB24ToUVRow_LASX(const uint8_t* src_rgb24,
|
|||||||
DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
|
DUP4_ARG2(__lasx_xvld, src_rgb24, 0, src_rgb24, 32, src_rgb24, 64,
|
||||||
next_rgb24, 0, reg0, reg1, reg2, tmp0);
|
next_rgb24, 0, reg0, reg1, reg2, tmp0);
|
||||||
DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
|
DUP2_ARG2(__lasx_xvld, next_rgb24, 32, next_rgb24, 64, tmp1, tmp2);
|
||||||
DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2,
|
DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
|
||||||
reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
|
0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
|
||||||
DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
|
DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
|
||||||
nexb);
|
nexb);
|
||||||
@ -1926,14 +1927,14 @@ void RAWToYRow_LASX(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
|||||||
0x1942194219421942, 0x1942194219421942};
|
0x1942194219421942, 0x1942194219421942};
|
||||||
__m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
|
__m256i const_1080 = {0x1080108010801080, 0x1080108010801080,
|
||||||
0x1080108010801080, 0x1080108010801080};
|
0x1080108010801080, 0x1080108010801080};
|
||||||
__m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C,
|
__m256i shuff0 = {0x0B09080605030200, 0x17151412110F0E0C, 0x0B09080605030200,
|
||||||
0x0B09080605030200, 0x17151412110F0E0C};
|
0x17151412110F0E0C};
|
||||||
__m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604,
|
__m256i shuff1 = {0x0301001E1D1B1A18, 0x0F0D0C0A09070604, 0x0301001E1D1B1A18,
|
||||||
0x0301001E1D1B1A18, 0x0F0D0C0A09070604};
|
0x0F0D0C0A09070604};
|
||||||
__m256i shuff2 = {0x000A000700040001, 0x001600130010000D,
|
__m256i shuff2 = {0x000A000700040001, 0x001600130010000D, 0x000A000700040001,
|
||||||
0x000A000700040001, 0x001600130010000D};
|
0x001600130010000D};
|
||||||
__m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005,
|
__m256i shuff3 = {0x0002001F001C0019, 0x000E000B00080005, 0x0002001F001C0019,
|
||||||
0x0002001F001C0019, 0x000E000B00080005};
|
0x000E000B00080005};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
reg0 = __lasx_xvld(src_raw, 0);
|
reg0 = __lasx_xvld(src_raw, 0);
|
||||||
@ -1989,24 +1990,24 @@ void RAWToUVRow_LASX(const uint8_t* src_raw,
|
|||||||
0x0706050403020100, 0x1F1C191613100908};
|
0x0706050403020100, 0x1F1C191613100908};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64,
|
DUP4_ARG2(__lasx_xvld, src_raw, 0, src_raw, 32, src_raw, 64, next_raw, 0,
|
||||||
next_raw, 0, reg0, reg1, reg2, tmp0);
|
reg0, reg1, reg2, tmp0);
|
||||||
DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
|
DUP2_ARG2(__lasx_xvld, next_raw, 32, next_raw, 64, tmp1, tmp2);
|
||||||
DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2,
|
DUP4_ARG3(__lasx_xvpermi_q, reg1, reg0, 0x30, reg2, reg0, 0x21, reg2, reg1,
|
||||||
reg1, 0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
|
0x30, tmp1, tmp0, 0x30, src0, src1, src2, nex0);
|
||||||
DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
|
DUP2_ARG3(__lasx_xvpermi_q, tmp2, tmp0, 0x21, tmp2, tmp1, 0x30, nex1, nex2);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b,
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_b, nex1, nex0, shuff0_b, tmpb,
|
||||||
tmpb, nexb);
|
nexb);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g,
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_g, nex1, nex0, shuff0_g, tmpg,
|
||||||
tmpg, nexg);
|
nexg);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r,
|
DUP2_ARG3(__lasx_xvshuf_b, src1, src0, shuff0_r, nex1, nex0, shuff0_r, tmpr,
|
||||||
tmpr, nexr);
|
nexr);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b,
|
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpb, shuff1_b, nex2, nexb, shuff1_b, tmpb,
|
||||||
tmpb, nexb);
|
nexb);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g,
|
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpg, shuff1_g, nex2, nexg, shuff1_g, tmpg,
|
||||||
tmpg, nexg);
|
nexg);
|
||||||
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r,
|
DUP2_ARG3(__lasx_xvshuf_b, src2, tmpr, shuff1_r, nex2, nexr, shuff1_r, tmpr,
|
||||||
tmpr, nexr);
|
nexr);
|
||||||
RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
|
RGBTOUV(tmpb, tmpg, tmpr, nexb, nexg, nexr, reg0, reg1);
|
||||||
dst0 = __lasx_xvpickod_b(reg1, reg0);
|
dst0 = __lasx_xvpickod_b(reg1, reg0);
|
||||||
__lasx_xvstelm_d(dst0, dst_u, 0, 0);
|
__lasx_xvstelm_d(dst0, dst_u, 0, 0);
|
||||||
@ -2071,8 +2072,8 @@ void NV12ToRGB565Row_LASX(const uint8_t* src_y,
|
|||||||
vec_vu = __lasx_xvld(src_uv, 0);
|
vec_vu = __lasx_xvld(src_uv, 0);
|
||||||
vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
|
vec_vu = __lasx_xvsub_b(vec_vu, const_0x80);
|
||||||
vec_vu = __lasx_vext2xv_h_b(vec_vu);
|
vec_vu = __lasx_vext2xv_h_b(vec_vu);
|
||||||
YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb,
|
YUVTORGB(vec_y, vec_vu, vec_vrub, vec_vgug, vec_yg, vec_yb, out_r, out_g,
|
||||||
out_r, out_g, out_b);
|
out_b);
|
||||||
out_b = __lasx_xvsrli_h(out_b, 3);
|
out_b = __lasx_xvsrli_h(out_b, 3);
|
||||||
out_g = __lasx_xvsrli_h(out_g, 2);
|
out_g = __lasx_xvsrli_h(out_g, 2);
|
||||||
out_r = __lasx_xvsrli_h(out_r, 3);
|
out_r = __lasx_xvsrli_h(out_r, 3);
|
||||||
@ -2109,8 +2110,8 @@ void NV21ToARGBRow_LASX(const uint8_t* src_y,
|
|||||||
vec_uv = __lasx_xvld(src_uv, 0);
|
vec_uv = __lasx_xvld(src_uv, 0);
|
||||||
vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
|
vec_uv = __lasx_xvsub_b(vec_uv, const_0x80);
|
||||||
vec_uv = __lasx_vext2xv_h_b(vec_uv);
|
vec_uv = __lasx_vext2xv_h_b(vec_uv);
|
||||||
YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b,
|
YUVTORGB(vec_y, vec_uv, vec_ubvr, vec_ugvg, vec_yg, vec_yb, out_b, out_g,
|
||||||
out_g, out_r);
|
out_r);
|
||||||
STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
|
STOREARGB(alpha, out_r, out_g, out_b, dst_argb);
|
||||||
src_y += 16;
|
src_y += 16;
|
||||||
src_uv += 16;
|
src_uv += 16;
|
||||||
@ -2127,8 +2128,8 @@ void ARGBToYJRow_LASX(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
|||||||
__m256i const_150 = __lasx_xvldi(0x96);
|
__m256i const_150 = __lasx_xvldi(0x96);
|
||||||
__m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
|
__m256i const_br = {0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D,
|
||||||
0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
|
0x4D1D4D1D4D1D4D1D, 0x4D1D4D1D4D1D4D1D};
|
||||||
__m256i shuff = {0x0000000400000000, 0x0000000500000001,
|
__m256i shuff = {0x0000000400000000, 0x0000000500000001, 0x0000000600000002,
|
||||||
0x0000000600000002, 0x0000000700000003};
|
0x0000000700000003};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
|
DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
|
||||||
@ -2169,8 +2170,8 @@ void ARGBToUVJRow_LASX(const uint8_t* src_argb,
|
|||||||
__m256i const_10 = __lasx_xvldi(0x40A);
|
__m256i const_10 = __lasx_xvldi(0x40A);
|
||||||
__m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
|
__m256i const_8080 = {0x8080808080808080, 0x8080808080808080,
|
||||||
0x8080808080808080, 0x8080808080808080};
|
0x8080808080808080, 0x8080808080808080};
|
||||||
__m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08,
|
__m256i shuff = {0x1614060412100200, 0x1E1C0E0C1A180A08, 0x1715070513110301,
|
||||||
0x1715070513110301, 0x1F1D0F0D1B190B09};
|
0x1F1D0F0D1B190B09};
|
||||||
|
|
||||||
for (x = 0; x < len; x++) {
|
for (x = 0; x < len; x++) {
|
||||||
DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
|
DUP4_ARG2(__lasx_xvld, src_argb, 0, src_argb, 32, src_argb, 64, src_argb,
|
||||||
|
|||||||
@ -1645,29 +1645,6 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
|
|||||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
: "cc", "memory", "q0", "q1", "q2", "q3");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"vmov.u8 d27, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q2, d0, d24 \n" // B
|
|
||||||
"vmlal.u8 q2, d1, d25 \n" // G
|
|
||||||
"vmlal.u8 q2, d2, d26 \n" // R
|
|
||||||
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vqadd.u8 d0, d27 \n"
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q12", "q13");
|
|
||||||
}
|
|
||||||
|
|
||||||
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_a,
|
uint8_t* dst_a,
|
||||||
int width) {
|
int width) {
|
||||||
@ -1686,48 +1663,6 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q2, d0, d24 \n" // B
|
|
||||||
"vmlal.u8 q2, d1, d25 \n" // G
|
|
||||||
"vmlal.u8 q2, d2, d26 \n" // R
|
|
||||||
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q12", "q13");
|
|
||||||
}
|
|
||||||
|
|
||||||
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 RGBA pixels.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q2, d1, d24 \n" // B
|
|
||||||
"vmlal.u8 q2, d2, d25 \n" // G
|
|
||||||
"vmlal.u8 q2, d3, d26 \n" // R
|
|
||||||
"vqrshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_rgba), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "q0", "q1", "q2", "q12", "q13");
|
|
||||||
}
|
|
||||||
|
|
||||||
// 8x1 pixels.
|
// 8x1 pixels.
|
||||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
@ -1747,15 +1682,13 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"vmull.u8 q2, d0, d24 \n" // B
|
"vmull.u8 q2, d0, d24 \n" // B
|
||||||
"vmlsl.u8 q2, d1, d25 \n" // G
|
"vmlsl.u8 q2, d1, d25 \n" // G
|
||||||
"vmlsl.u8 q2, d2, d26 \n" // R
|
"vmlsl.u8 q2, d2, d26 \n" // R
|
||||||
"vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
|
|
||||||
|
|
||||||
"vmull.u8 q3, d2, d24 \n" // R
|
"vmull.u8 q3, d2, d24 \n" // R
|
||||||
"vmlsl.u8 q3, d1, d28 \n" // G
|
"vmlsl.u8 q3, d1, d28 \n" // G
|
||||||
"vmlsl.u8 q3, d0, d27 \n" // B
|
"vmlsl.u8 q3, d0, d27 \n" // B
|
||||||
"vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
|
|
||||||
|
|
||||||
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
|
"vaddhn.u16 d0, q2, q15 \n" // +128 -> unsigned
|
||||||
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
|
"vaddhn.u16 d1, q3, q15 \n" // +128 -> unsigned
|
||||||
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
|
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
|
||||||
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
|
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
|
||||||
@ -1775,13 +1708,11 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
|
"vmul.s16 q8, " #QB ", q10 \n" /* B */ \
|
||||||
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
|
"vmls.s16 q8, " #QG ", q11 \n" /* G */ \
|
||||||
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
|
"vmls.s16 q8, " #QR ", q12 \n" /* R */ \
|
||||||
"vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
|
|
||||||
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
|
"vmul.s16 q9, " #QR ", q10 \n" /* R */ \
|
||||||
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
|
"vmls.s16 q9, " #QG ", q14 \n" /* G */ \
|
||||||
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
|
"vmls.s16 q9, " #QB ", q13 \n" /* B */ \
|
||||||
"vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
|
"vaddhn.u16 d0, q8, q15 \n" /* +128 -> unsigned */ \
|
||||||
"vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
|
"vaddhn.u16 d1, q9, q15 \n" /* +128 -> unsigned */
|
||||||
"vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
|
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||||
@ -2559,161 +2490,169 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
|||||||
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
|
||||||
}
|
}
|
||||||
|
|
||||||
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
struct RgbConstants {
|
||||||
|
uint8_t kRGBToY[4];
|
||||||
|
uint16_t kAddY;
|
||||||
|
uint16_t pad;
|
||||||
|
};
|
||||||
|
|
||||||
|
// RGB to JPeg coefficients
|
||||||
|
// B * 0.1140 coefficient = 29
|
||||||
|
// G * 0.5870 coefficient = 150
|
||||||
|
// R * 0.2990 coefficient = 77
|
||||||
|
// Add 0.5 = 0x80
|
||||||
|
struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
|
||||||
|
|
||||||
|
struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
|
||||||
|
|
||||||
|
// RGB to BT.601 coefficients
|
||||||
|
// B * 0.1016 coefficient = 25
|
||||||
|
// G * 0.5078 coefficient = 129
|
||||||
|
// R * 0.2578 coefficient = 66
|
||||||
|
// Add 16.5 = 0x1080
|
||||||
|
|
||||||
|
struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080};
|
||||||
|
|
||||||
|
struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
|
||||||
|
|
||||||
|
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
||||||
|
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
|
"vdup.u8 d21, d0[1] \n"
|
||||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
"vdup.u8 d22, d0[2] \n"
|
||||||
|
"vdup.u16 q12, d0[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||||
"vmull.u8 q8, d1, d4 \n" // R
|
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||||
"vmlal.u8 q8, d2, d5 \n" // G
|
"vmull.u8 q8, d0, d20 \n" // B
|
||||||
"vmlal.u8 q8, d3, d6 \n" // B
|
"vmull.u8 q9, d1, d20 \n"
|
||||||
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
|
"vmlal.u8 q8, d2, d21 \n" // G
|
||||||
"vqadd.u8 d0, d7 \n"
|
"vmlal.u8 q9, d3, d21 \n"
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
"vmlal.u8 q8, d4, d22 \n" // R
|
||||||
|
"vmlal.u8 q9, d5, d22 \n"
|
||||||
|
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||||
|
"vaddhn.u16 d1, q9, q12 \n"
|
||||||
|
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_bgra), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||||
|
"q12");
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||||
|
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
|
||||||
|
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
|
||||||
"vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q8, d0, d4 \n" // R
|
|
||||||
"vmlal.u8 q8, d1, d5 \n" // G
|
|
||||||
"vmlal.u8 q8, d2, d6 \n" // B
|
|
||||||
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vqadd.u8 d0, d7 \n"
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_abgr), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
|
||||||
|
// Same code as ARGB, except the LD4
|
||||||
|
void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
|
"vdup.u8 d21, d0[1] \n"
|
||||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
"vdup.u8 d22, d0[2] \n"
|
||||||
|
"vdup.u16 q12, d0[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
|
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of RGBA
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
"vld4.8 {d1, d3, d5, d7}, [%0]! \n"
|
||||||
"vmull.u8 q8, d1, d4 \n" // B
|
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||||
"vmlal.u8 q8, d2, d5 \n" // G
|
"vmull.u8 q8, d2, d20 \n" // B
|
||||||
"vmlal.u8 q8, d3, d6 \n" // R
|
"vmull.u8 q9, d3, d20 \n"
|
||||||
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
|
"vmlal.u8 q8, d4, d21 \n" // G
|
||||||
"vqadd.u8 d0, d7 \n"
|
"vmlal.u8 q9, d5, d21 \n"
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
"vmlal.u8 q8, d6, d22 \n" // R
|
||||||
|
"vmlal.u8 q9, d7, d22 \n"
|
||||||
|
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||||
|
"vaddhn.u16 d1, q9, q12 \n"
|
||||||
|
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_rgba), // %0
|
: "+r"(src_rgba), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||||
|
"q12");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
|
||||||
"vmov.u8 d4, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"vmov.u8 d6, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q8, d0, d4 \n" // B
|
|
||||||
"vmlal.u8 q8, d1, d5 \n" // G
|
|
||||||
"vmlal.u8 q8, d2, d6 \n" // R
|
|
||||||
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vqadd.u8 d0, d7 \n"
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_rgb24), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
|
||||||
|
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||||
|
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"vmov.u8 d6, #25 \n" // B * 0.1016 coefficient
|
"vld1.8 {d0}, [%3] \n" // load rgbconstants
|
||||||
"vmov.u8 d5, #129 \n" // G * 0.5078 coefficient
|
"vdup.u8 d20, d0[0] \n"
|
||||||
"vmov.u8 d4, #66 \n" // R * 0.2578 coefficient
|
"vdup.u8 d21, d0[1] \n"
|
||||||
"vmov.u8 d7, #16 \n" // Add 16 constant
|
"vdup.u8 d22, d0[2] \n"
|
||||||
|
"vdup.u16 q12, d0[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
|
"vld3.8 {d2, d4, d6}, [%0]! \n" // load 16 pixels of
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
// RGB24.
|
||||||
"vmull.u8 q8, d0, d4 \n" // B
|
"vld3.8 {d3, d5, d7}, [%0]! \n"
|
||||||
"vmlal.u8 q8, d1, d5 \n" // G
|
"subs %2, %2, #16 \n" // 16 processed per loop.
|
||||||
"vmlal.u8 q8, d2, d6 \n" // R
|
"vmull.u8 q8, d2, d20 \n" // B
|
||||||
"vqrshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit Y
|
"vmull.u8 q9, d3, d20 \n"
|
||||||
"vqadd.u8 d0, d7 \n"
|
"vmlal.u8 q8, d4, d21 \n" // G
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
"vmlal.u8 q9, d5, d21 \n"
|
||||||
|
"vmlal.u8 q8, d6, d22 \n" // R
|
||||||
|
"vmlal.u8 q9, d7, d22 \n"
|
||||||
|
"vaddhn.u16 d0, q8, q12 \n" // 16 bit to 8 bit Y
|
||||||
|
"vaddhn.u16 d1, q9, q12 \n"
|
||||||
|
"vst1.8 {d0, d1}, [%1]! \n" // store 16 pixels Y.
|
||||||
"bgt 1b \n"
|
"bgt 1b \n"
|
||||||
: "+r"(src_raw), // %0
|
: "+r"(src_rgb), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8");
|
: "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "d20", "d21", "d22",
|
||||||
|
"q12");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||||
asm volatile(
|
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
"vmov.u8 d4, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"vmov.u8 d6, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
|
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q4, d0, d4 \n" // B
|
|
||||||
"vmlal.u8 q4, d1, d5 \n" // G
|
|
||||||
"vmlal.u8 q4, d2, d6 \n" // R
|
|
||||||
"vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_rgb24), // %0
|
|
||||||
"+r"(dst_yj), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||||
asm volatile(
|
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
|
||||||
"vmov.u8 d6, #29 \n" // B * 0.1140 coefficient
|
}
|
||||||
"vmov.u8 d5, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"vmov.u8 d4, #77 \n" // R * 0.2990 coefficient
|
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||||
"1: \n"
|
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
|
||||||
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
|
}
|
||||||
"subs %2, %2, #8 \n" // 8 processed per loop.
|
|
||||||
"vmull.u8 q4, d0, d4 \n" // R
|
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||||
"vmlal.u8 q4, d1, d5 \n" // G
|
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
|
||||||
"vmlal.u8 q4, d2, d6 \n" // B
|
|
||||||
"vqrshrn.u16 d0, q4, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
|
|
||||||
"bgt 1b \n"
|
|
||||||
: "+r"(src_raw), // %0
|
|
||||||
"+r"(dst_yj), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "q4");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bilinear filter 16x2 -> 16x1
|
// Bilinear filter 16x2 -> 16x1
|
||||||
|
|||||||
@ -2021,30 +2021,6 @@ void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
|
|||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4");
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
|
|
||||||
}
|
|
||||||
|
|
||||||
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_a,
|
uint8_t* dst_a,
|
||||||
int width) {
|
int width) {
|
||||||
@ -2063,50 +2039,6 @@ void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v3.8h, v0.8b, v4.8b \n" // B
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v3.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v3.8h, v2.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_argb), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
|
||||||
}
|
|
||||||
|
|
||||||
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
|
||||||
asm volatile(
|
|
||||||
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 RGBA
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v0.8h, v1.8b, v4.8b \n" // B
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v0.8h, v2.8b, v5.8b \n" // G
|
|
||||||
"umlal v0.8h, v3.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v3.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"st1 {v3.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_rgba), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
|
||||||
}
|
|
||||||
|
|
||||||
// 8x1 pixels.
|
// 8x1 pixels.
|
||||||
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
||||||
uint8_t* dst_u,
|
uint8_t* dst_u,
|
||||||
@ -2124,18 +2056,16 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 ARGB
|
||||||
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
"subs %w3, %w3, #8 \n" // 8 processed per loop.
|
||||||
"umull v4.8h, v0.8b, v24.8b \n" // B
|
"umull v4.8h, v0.8b, v24.8b \n" // B
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlsl v4.8h, v1.8b, v25.8b \n" // G
|
"umlsl v4.8h, v1.8b, v25.8b \n" // G
|
||||||
"umlsl v4.8h, v2.8b, v26.8b \n" // R
|
"umlsl v4.8h, v2.8b, v26.8b \n" // R
|
||||||
"add v4.8h, v4.8h, v29.8h \n" // +128 -> unsigned
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
|
|
||||||
"umull v3.8h, v2.8b, v24.8b \n" // R
|
"umull v3.8h, v2.8b, v24.8b \n" // R
|
||||||
"umlsl v3.8h, v1.8b, v28.8b \n" // G
|
"umlsl v3.8h, v1.8b, v28.8b \n" // G
|
||||||
"umlsl v3.8h, v0.8b, v27.8b \n" // B
|
"umlsl v3.8h, v0.8b, v27.8b \n" // B
|
||||||
"add v3.8h, v3.8h, v29.8h \n" // +128 -> unsigned
|
|
||||||
|
|
||||||
"uqshrn v0.8b, v4.8h, #8 \n" // 16 bit to 8 bit U
|
"addhn v0.8b, v4.8h, v29.8h \n" // +128 -> unsigned
|
||||||
"uqshrn v1.8b, v3.8h, #8 \n" // 16 bit to 8 bit V
|
"addhn v1.8b, v3.8h, v29.8h \n" // +128 -> unsigned
|
||||||
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
|
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
|
||||||
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
|
"st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
|
||||||
@ -2166,10 +2096,8 @@ void ARGBToUV444Row_NEON(const uint8_t* src_argb,
|
|||||||
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
|
"mls v4.8h, " #QG ",v24.8h \n" /* G */ \
|
||||||
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
|
"mls v3.8h, " #QR ",v22.8h \n" /* R */ \
|
||||||
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
|
"mls v4.8h, " #QB ",v23.8h \n" /* B */ \
|
||||||
"add v3.8h, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
|
"addhn v0.8b, v3.8h, v25.8h \n" /* +128 -> unsigned */ \
|
||||||
"add v4.8h, v4.8h, v25.8h \n" /* +128 -> unsigned */ \
|
"addhn v1.8b, v4.8h, v25.8h \n" /* +128 -> unsigned */
|
||||||
"uqshrn v0.8b, v3.8h, #8 \n" /* 16 bit to 8 bit U */ \
|
|
||||||
"uqshrn v1.8b, v4.8h, #8 \n" /* 16 bit to 8 bit V */
|
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
|
||||||
@ -2807,168 +2735,169 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
|
|||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27");
|
||||||
}
|
}
|
||||||
|
|
||||||
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
struct RgbConstants {
|
||||||
|
uint8_t kRGBToY[4];
|
||||||
|
uint16_t kAddY;
|
||||||
|
uint16_t pad;
|
||||||
|
};
|
||||||
|
|
||||||
|
// RGB to JPeg coefficients
|
||||||
|
// B * 0.1140 coefficient = 29
|
||||||
|
// G * 0.5870 coefficient = 150
|
||||||
|
// R * 0.2990 coefficient = 77
|
||||||
|
// Add 0.5 = 0x80
|
||||||
|
struct RgbConstants kRgb24JPEGConstants = {{29, 150, 77, 0}, 128};
|
||||||
|
|
||||||
|
struct RgbConstants kRawJPEGConstants = {{77, 150, 29, 0}, 128};
|
||||||
|
|
||||||
|
// RGB to BT.601 coefficients
|
||||||
|
// B * 0.1016 coefficient = 25
|
||||||
|
// G * 0.5078 coefficient = 129
|
||||||
|
// R * 0.2578 coefficient = 66
|
||||||
|
// Add 16.5 = 0x1080
|
||||||
|
|
||||||
|
struct RgbConstants kRgb24I601Constants = {{25, 129, 66, 0}, 0x1080};
|
||||||
|
|
||||||
|
struct RgbConstants kRawI601Constants = {{66, 129, 25, 0}, 0x1080};
|
||||||
|
|
||||||
|
// ARGB expects first 3 values to contain RGB and 4th value is ignored.
|
||||||
|
void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v4.8b, #66 \n" // R * 0.2578 coefficient
|
"ldr d0, [%3] \n" // load rgbconstants
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
"dup v6.16b, v0.b[0] \n"
|
||||||
"movi v6.8b, #25 \n" // B * 0.1016 coefficient
|
"dup v7.16b, v0.b[1] \n"
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
"dup v16.16b, v0.b[2] \n"
|
||||||
|
"dup v17.8h, v0.h[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
"ld4 {v2.16b,v3.16b,v4.16b,v5.16b}, [%0], #64 \n" // load 16
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
// pixels.
|
||||||
"umull v16.8h, v1.8b, v4.8b \n" // R
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
|
"umull v0.8h, v2.8b, v6.8b \n" // B
|
||||||
|
"umull2 v1.8h, v2.16b, v6.16b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
"umlal v0.8h, v3.8b, v7.8b \n" // G
|
||||||
"umlal v16.8h, v3.8b, v6.8b \n" // B
|
"umlal2 v1.8h, v3.16b, v7.16b \n"
|
||||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
"umlal v0.8h, v4.8b, v16.8b \n" // R
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
"umlal2 v1.8h, v4.16b, v16.16b \n"
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
|
||||||
|
"addhn v1.8b, v1.8h, v17.8h \n"
|
||||||
|
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_bgra), // %0
|
: "+r"(src_argb), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17");
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToYRow_NEON(const uint8_t* src_argb, uint8_t* dst_y, int width) {
|
||||||
|
ARGBToYMatrixRow_NEON(src_argb, dst_y, width, &kRgb24I601Constants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void ARGBToYJRow_NEON(const uint8_t* src_argb, uint8_t* dst_yj, int width) {
|
||||||
|
ARGBToYMatrixRow_NEON(src_argb, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
void ABGRToYRow_NEON(const uint8_t* src_abgr, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
ARGBToYMatrixRow_NEON(src_abgr, dst_y, width, &kRawI601Constants);
|
||||||
"movi v6.8b, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"movi v4.8b, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v16.8h, v0.8b, v4.8b \n" // R
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v16.8h, v2.8b, v6.8b \n" // B
|
|
||||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_abgr), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
// RGBA expects first value to be A and ignored, then 3 values to contain RGB.
|
||||||
|
// Same code as ARGB, except the LD4
|
||||||
|
void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
|
"ldr d0, [%3] \n" // load rgbconstants
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
"dup v6.16b, v0.b[0] \n"
|
||||||
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
|
"dup v7.16b, v0.b[1] \n"
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
"dup v16.16b, v0.b[2] \n"
|
||||||
|
"dup v17.8h, v0.h[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // load 8 pixels.
|
"ld4 {v1.16b,v2.16b,v3.16b,v4.16b}, [%0], #64 \n" // load 16
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
// pixels.
|
||||||
"umull v16.8h, v1.8b, v4.8b \n" // B
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
|
"umull v0.8h, v2.8b, v6.8b \n" // B
|
||||||
|
"umull2 v1.8h, v2.16b, v6.16b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v16.8h, v2.8b, v5.8b \n" // G
|
"umlal v0.8h, v3.8b, v7.8b \n" // G
|
||||||
"umlal v16.8h, v3.8b, v6.8b \n" // R
|
"umlal2 v1.8h, v3.16b, v7.16b \n"
|
||||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
"umlal v0.8h, v4.8b, v16.8b \n" // R
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
"umlal2 v1.8h, v4.16b, v16.16b \n"
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"addhn v0.8b, v0.8h, v17.8h \n" // 16 bit to 8 bit Y
|
||||||
|
"addhn v1.8b, v1.8h, v17.8h \n"
|
||||||
|
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_rgba), // %0
|
: "+r"(src_rgba), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16",
|
||||||
|
"v17");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
void RGBAToYRow_NEON(const uint8_t* src_rgba, uint8_t* dst_y, int width) {
|
||||||
asm volatile(
|
RGBAToYMatrixRow_NEON(src_rgba, dst_y, width, &kRgb24I601Constants);
|
||||||
"movi v4.8b, #25 \n" // B * 0.1016 coefficient
|
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
|
||||||
"movi v6.8b, #66 \n" // R * 0.2578 coefficient
|
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
|
||||||
"1: \n"
|
|
||||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_rgb24), // %0
|
|
||||||
"+r"(dst_y), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
void RGBAToYJRow_NEON(const uint8_t* src_rgba, uint8_t* dst_yj, int width) {
|
||||||
|
RGBAToYMatrixRow_NEON(src_rgba, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void BGRAToYRow_NEON(const uint8_t* src_bgra, uint8_t* dst_y, int width) {
|
||||||
|
RGBAToYMatrixRow_NEON(src_bgra, dst_y, width, &kRawI601Constants);
|
||||||
|
}
|
||||||
|
|
||||||
|
void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
|
||||||
|
uint8_t* dst_y,
|
||||||
|
int width,
|
||||||
|
const struct RgbConstants* rgbconstants) {
|
||||||
asm volatile(
|
asm volatile(
|
||||||
"movi v6.8b, #25 \n" // B * 0.1016 coefficient
|
"ldr d0, [%3] \n" // load rgbconstants
|
||||||
"movi v5.8b, #129 \n" // G * 0.5078 coefficient
|
"dup v5.16b, v0.b[0] \n"
|
||||||
"movi v4.8b, #66 \n" // R * 0.2578 coefficient
|
"dup v6.16b, v0.b[1] \n"
|
||||||
"movi v7.8b, #16 \n" // Add 16 constant
|
"dup v7.16b, v0.b[2] \n"
|
||||||
|
"dup v16.8h, v0.h[2] \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
"ld3 {v2.16b,v3.16b,v4.16b}, [%0], #48 \n" // load 16 pixels.
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
"subs %w2, %w2, #16 \n" // 16 processed per loop.
|
||||||
"umull v16.8h, v0.8b, v4.8b \n" // B
|
"umull v0.8h, v2.8b, v5.8b \n" // B
|
||||||
|
"umull2 v1.8h, v2.16b, v5.16b \n"
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
"prfm pldl1keep, [%0, 448] \n"
|
||||||
"umlal v16.8h, v1.8b, v5.8b \n" // G
|
"umlal v0.8h, v3.8b, v6.8b \n" // G
|
||||||
"umlal v16.8h, v2.8b, v6.8b \n" // R
|
"umlal2 v1.8h, v3.16b, v6.16b \n"
|
||||||
"uqrshrn v0.8b, v16.8h, #8 \n" // 16 bit to 8 bit Y
|
"umlal v0.8h, v4.8b, v7.8b \n" // R
|
||||||
"uqadd v0.8b, v0.8b, v7.8b \n"
|
"umlal2 v1.8h, v4.16b, v7.16b \n"
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
"addhn v0.8b, v0.8h, v16.8h \n" // 16 bit to 8 bit Y
|
||||||
|
"addhn v1.8b, v1.8h, v16.8h \n"
|
||||||
|
"st1 {v0.8b, v1.8b}, [%1], #16 \n" // store 16 pixels Y.
|
||||||
"b.gt 1b \n"
|
"b.gt 1b \n"
|
||||||
: "+r"(src_raw), // %0
|
: "+r"(src_rgb), // %0
|
||||||
"+r"(dst_y), // %1
|
"+r"(dst_y), // %1
|
||||||
"+r"(width) // %2
|
"+r"(width) // %2
|
||||||
:
|
: "r"(rgbconstants) // %3
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16");
|
||||||
}
|
}
|
||||||
|
|
||||||
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) {
|
||||||
asm volatile(
|
RGBToYMatrixRow_NEON(src_rgb24, dst_yj, width, &kRgb24JPEGConstants);
|
||||||
"movi v4.8b, #29 \n" // B * 0.1140 coefficient
|
|
||||||
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"movi v6.8b, #77 \n" // R * 0.2990 coefficient
|
|
||||||
"1: \n"
|
|
||||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v0.8h, v0.8b, v4.8b \n" // B
|
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
|
||||||
"umlal v0.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v0.8h, v2.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_rgb24), // %0
|
|
||||||
"+r"(dst_yj), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) {
|
||||||
asm volatile(
|
RGBToYMatrixRow_NEON(src_raw, dst_yj, width, &kRawJPEGConstants);
|
||||||
"movi v6.8b, #29 \n" // B * 0.1140 coefficient
|
}
|
||||||
"movi v5.8b, #150 \n" // G * 0.5870 coefficient
|
|
||||||
"movi v4.8b, #77 \n" // R * 0.2990 coefficient
|
void RGB24ToYRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_y, int width) {
|
||||||
"1: \n"
|
RGBToYMatrixRow_NEON(src_rgb24, dst_y, width, &kRgb24I601Constants);
|
||||||
"ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // load 8 pixels.
|
}
|
||||||
"subs %w2, %w2, #8 \n" // 8 processed per loop.
|
|
||||||
"umull v0.8h, v0.8b, v4.8b \n" // B
|
void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) {
|
||||||
"prfm pldl1keep, [%0, 448] \n"
|
RGBToYMatrixRow_NEON(src_raw, dst_y, width, &kRawI601Constants);
|
||||||
"umlal v0.8h, v1.8b, v5.8b \n" // G
|
|
||||||
"umlal v0.8h, v2.8b, v6.8b \n" // R
|
|
||||||
"uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y
|
|
||||||
"st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y.
|
|
||||||
"b.gt 1b \n"
|
|
||||||
: "+r"(src_raw), // %0
|
|
||||||
"+r"(dst_yj), // %1
|
|
||||||
"+r"(width) // %2
|
|
||||||
:
|
|
||||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Bilinear filter 16x2 -> 16x1
|
// Bilinear filter 16x2 -> 16x1
|
||||||
|
|||||||
@ -3814,13 +3814,17 @@ TESTQPLANAR16TOB(I210Alpha, 2, 1, ARGBFilter, 4, 4, 1, 10)
|
|||||||
P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
|
P216ToAR30Matrix(a, b, c, d, e, f, &kYuvH709Constants, g, h)
|
||||||
|
|
||||||
#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \
|
#define P010ToARGBFilter(a, b, c, d, e, f, g, h) \
|
||||||
P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear)
|
P010ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
|
||||||
|
kFilterBilinear)
|
||||||
#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \
|
#define P210ToARGBFilter(a, b, c, d, e, f, g, h) \
|
||||||
P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear)
|
P210ToARGBMatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
|
||||||
|
kFilterBilinear)
|
||||||
#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \
|
#define P010ToAR30Filter(a, b, c, d, e, f, g, h) \
|
||||||
P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear)
|
P010ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
|
||||||
|
kFilterBilinear)
|
||||||
#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \
|
#define P210ToAR30Filter(a, b, c, d, e, f, g, h) \
|
||||||
P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, kFilterBilinear)
|
P210ToAR30MatrixFilter(a, b, c, d, e, f, &kYuvH709Constants, g, h, \
|
||||||
|
kFilterBilinear)
|
||||||
|
|
||||||
#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
|
#if !defined(DISABLE_SLOW_TESTS) || defined(__x86_64__) || defined(__i386__)
|
||||||
TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10)
|
TESTBIPLANAR16TOB(P010, 2, 2, ARGB, 4, 4, 1, 10)
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user