Enable SIMD for exact RGB to Y conversions

Bug: libyuv:908, b/202888439
Change-Id: Icc5470b85d91b441ded9958ee04b4f32246646f0
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3230489
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
This commit is contained in:
Frank Barchard 2021-10-19 00:02:50 -07:00 committed by libyuv LUCI CQ
parent f0cfc1f1c8
commit b179f1847a
9 changed files with 736 additions and 345 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1799
Version: 1801
License: BSD
License File: LICENSE

View File

@ -74,9 +74,9 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
// Conversions:
#define HAS_ABGRTOYROW_SSSE3
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#endif
#define HAS_ARGB1555TOARGBROW_SSE2
#define HAS_ARGB4444TOARGBROW_SSE2
@ -90,13 +90,13 @@ extern "C" {
#define HAS_ARGBTORGB565DITHERROW_SSE2
#define HAS_ARGBTORGB565ROW_SSE2
#define HAS_ARGBTOYJROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ARGBTOUV444ROW_SSSE3
#define HAS_ARGBTOUVJROW_SSSE3
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_BGRATOYROW_SSSE3
#endif
#define HAS_COPYROW_ERMS
#define HAS_COPYROW_SSE2
@ -125,13 +125,13 @@ extern "C" {
#define HAS_RAWTORGB24ROW_SSSE3
#define HAS_RGB24TOARGBROW_SSSE3
#define HAS_RGB565TOARGBROW_SSE2
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOYROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_RGB24TOYJROW_SSSE3
#define HAS_RAWTOYJROW_SSSE3
#define HAS_RGBATOUVROW_SSSE3
#define HAS_RGBATOYROW_SSSE3
#endif
#define HAS_SETROW_ERMS
#define HAS_SETROW_X86
@ -203,10 +203,10 @@ extern "C" {
#define HAS_ARGBTOYJROW_AVX2
#define HAS_RAWTOYJROW_AVX2
#define HAS_RGB24TOYJROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#if !defined(LIBYUV_BIT_EXACT)
#define HAS_ARGBTOUVJROW_AVX2
#define HAS_ARGBTOUVROW_AVX2
#define HAS_ARGBTOYROW_AVX2
#endif
#define HAS_COPYROW_AVX
#define HAS_H422TOARGBROW_AVX2
@ -472,10 +472,12 @@ extern "C" {
#define HAS_RAWTORGB24ROW_NEON
#define HAS_RAWTORGBAROW_NEON
#define HAS_RAWTOUVROW_NEON
#define HAS_RAWTOUVJROW_NEON
#define HAS_RAWTOYJROW_NEON
#define HAS_RAWTOYROW_NEON
#define HAS_RGB24TOARGBROW_NEON
#define HAS_RGB24TOUVROW_NEON
#define HAS_RGB24TOUVJROW_NEON
#define HAS_RGB24TOYJROW_NEON
#define HAS_RGB24TOYROW_NEON
#define HAS_RGB565TOARGBROW_NEON
@ -1096,6 +1098,16 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVJRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,
@ -1433,6 +1445,16 @@ void RAWToUVRow_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVJRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB565ToUVRow_Any_NEON(const uint8_t* src_ptr,
int src_stride,
uint8_t* dst_u,
@ -1578,6 +1600,16 @@ void RAWToUVRow_C(const uint8_t* src_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB24ToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RAWToUVJRow_C(const uint8_t* src_rgb,
int src_stride_rgb,
uint8_t* dst_u,
uint8_t* dst_v,
int width);
void RGB565ToUVRow_C(const uint8_t* src_rgb565,
int src_stride_rgb565,
uint8_t* dst_u,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1799
#define LIBYUV_VERSION 1801
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -1368,38 +1368,54 @@ int ARGBToI420(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
@ -1560,26 +1576,38 @@ int ABGRToI420(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@ -1662,16 +1690,22 @@ int RGBAToI420(const uint8_t* src_rgba,
src_rgba = src_rgba + (height - 1) * src_stride_rgba;
src_stride_rgba = -src_stride_rgba;
}
#if defined(HAS_RGBATOYROW_SSSE3) && defined(HAS_RGBATOUVROW_SSSE3)
#if defined(HAS_RGBATOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
RGBAToYRow = RGBAToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_SSSE3;
RGBAToYRow = RGBAToYRow_SSSE3;
}
}
#endif
#if defined(HAS_RGBATOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
RGBAToUVRow = RGBAToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_RGBATOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGBAToYRow = RGBAToYRow_Any_NEON;
@ -1727,6 +1761,12 @@ int RGBAToI420(const uint8_t* src_rgba,
return 0;
}
// Enabled if 1 pass is available
#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#define HAS_RGB24TOYROW
#endif
// Convert RGB24 to I420.
LIBYUV_API
int RGB24ToI420(const uint8_t* src_rgb24,
@ -1740,8 +1780,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#if defined(HAS_RGB24TOYROW)
void (*RGB24ToUVRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVRow_C;
@ -1766,6 +1805,8 @@ int RGB24ToI420(const uint8_t* src_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
#if defined(HAS_RGB24TOYROW)
// Neon version does direct RGB24 to YUV.
#if defined(HAS_RGB24TOYROW_NEON) && defined(HAS_RGB24TOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -1778,8 +1819,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
}
// MMI and MSA version does direct RGB24 to YUV.
#elif (defined(HAS_RGB24TOYROW_MMI) || defined(HAS_RGB24TOYROW_MSA))
#endif
#if defined(HAS_RGB24TOYROW_MMI) && defined(HAS_RGB24TOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToUVRow = RGB24ToUVRow_Any_MMI;
@ -1802,16 +1842,10 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
}
}
#endif
#else // HAS_RGB24TOYROW
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@ -1820,51 +1854,49 @@ int RGB24ToI420(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#endif // HAS_RGB24TOYROW
{
#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#if !defined(HAS_RGB24TOYROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#if defined(HAS_RGB24TOYROW)
RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@ -1881,8 +1913,7 @@ int RGB24ToI420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
#if (defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#if defined(HAS_RGB24TOYROW)
RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYRow(src_rgb24, dst_y, width);
#else
@ -1891,15 +1922,20 @@ int RGB24ToI420(const uint8_t* src_rgb24,
ARGBToYRow(row, dst_y, width);
#endif
}
#if !(defined(HAS_RGB24TOYROW_NEON) || defined(HAS_RGB24TOYROW_MSA) || \
defined(HAS_RGB24TOYROW_MMI))
#if !defined(HAS_RGB24TOYROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
#undef HAS_RGB24TOYROW
// Enabled if 1 pass is available
#if (defined(HAS_RGB24TOYJROW_NEON) || defined(HAS_RGB24TOYJROW_MSA) || \
defined(HAS_RGB24TOYJROW_MMI))
#define HAS_RGB24TOYJROW
#endif
// TODO(fbarchard): Use Matrix version to implement I420 and J420.
// Convert RGB24 to J420.
LIBYUV_API
int RGB24ToJ420(const uint8_t* src_rgb24,
@ -1913,10 +1949,9 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
int width,
int height) {
int y;
#if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)
#if defined(HAS_RGB24TOYJROW)
void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24,
uint8_t* dst_u, uint8_t* dst_v, int width) =
uint8_t* dst_u, uint8_t* dst_v, int width) =
RGB24ToUVJRow_C;
void (*RGB24ToYJRow)(const uint8_t* src_rgb24, uint8_t* dst_y, int width) =
RGB24ToYJRow_C;
@ -1924,7 +1959,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
void (*RGB24ToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RGB24ToARGBRow_C;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width) =
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYJRow_C;
@ -1939,6 +1974,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
src_stride_rgb24 = -src_stride_rgb24;
}
#if defined(HAS_RGB24TOYJROW)
// Neon version does direct RGB24 to YUV.
#if defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -1951,8 +1988,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
}
// MMI and MSA version does direct RGB24 to YUV.
#elif (defined(HAS_RGB24TOYJROW_MMI) || defined(HAS_RGB24TOYJROW_MSA))
#endif
#if defined(HAS_RGB24TOYJROW_MMI) && defined(HAS_RGB24TOUVJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RGB24ToUVJRow = RGB24ToUVJRow_Any_MMI;
@ -1975,15 +2011,10 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
#else
#if defined(HAS_RGB24TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RGB24ToARGBRow = RGB24ToARGBRow_NEON;
}
}
#endif
// Other platforms do intermediate conversion from RGB24 to ARGB.
#else // HAS_RGB24TOYJROW
#if defined(HAS_RGB24TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
@ -1992,51 +2023,49 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
#endif // HAS_RGB24TOYJROW
{
#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
#if !defined(HAS_RGB24TOYJROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
#if defined(HAS_RGB24TOYJROW)
RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
@ -2053,8 +2082,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
dst_v += dst_stride_v;
}
if (height & 1) {
#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
#if defined(HAS_RGB24TOYJROW)
RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width);
RGB24ToYJRow(src_rgb24, dst_y, width);
#else
@ -2063,31 +2091,37 @@ int RGB24ToJ420(const uint8_t* src_rgb24,
ARGBToYJRow(row, dst_y, width);
#endif
}
#if !((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \
defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI))
#if !defined(HAS_RGB24TOYJROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
#undef HAS_RGB24TOYJROW
// Enabled if 1 pass is available
#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
defined(HAS_RAWTOYROW_MMI))
#define HAS_RAWTOYROW
#endif
// Convert RAW to I420.
LIBYUV_API
int RAWToI420(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
#if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \
defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u,
uint8_t* dst_v, int width) = RAWToUVRow_C;
#if defined(HAS_RAWTOYROW)
void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw,
uint8_t* dst_u, uint8_t* dst_v, int width) =
RAWToUVRow_C;
void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
RAWToYRow_C;
#else
@ -2109,6 +2143,8 @@ int RAWToI420(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
#if defined(HAS_RAWTOYROW)
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -2121,8 +2157,7 @@ int RAWToI420(const uint8_t* src_raw,
}
}
}
// MMI and MSA version does direct RAW to YUV.
#elif (defined(HAS_RAWTOYROW_MMI) || defined(HAS_RAWTOYROW_MSA))
#endif
#if defined(HAS_RAWTOYROW_MMI) && defined(HAS_RAWTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RAWToUVRow = RAWToUVRow_Any_MMI;
@ -2145,28 +2180,10 @@ int RAWToI420(const uint8_t* src_raw,
}
}
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON) && defined(HAS_ARGBTOUVROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVRow = ARGBToUVRow_Any_NEON;
ARGBToYRow = ARGBToYRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYRow = ARGBToYRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_NEON;
}
}
}
#endif
#else // HAS_RAWTOYROW
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@ -2175,39 +2192,49 @@ int RAWToI420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#endif // HAS_RAWTOYROW
{
#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
defined(HAS_RAWTOYROW_MMI))
#if !defined(HAS_RAWTOYROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
defined(HAS_RAWTOYROW_MMI))
#if defined(HAS_RAWTOYROW)
RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@ -2224,8 +2251,7 @@ int RAWToI420(const uint8_t* src_raw,
dst_v += dst_stride_v;
}
if (height & 1) {
#if (defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
defined(HAS_RAWTOYROW_MMI))
#if defined(HAS_RAWTOYROW)
RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
RAWToYRow(src_raw, dst_y, width);
#else
@ -2234,32 +2260,36 @@ int RAWToI420(const uint8_t* src_raw,
ARGBToYRow(row, dst_y, width);
#endif
}
#if !(defined(HAS_RAWTOYROW_NEON) || defined(HAS_RAWTOYROW_MSA) || \
defined(HAS_RAWTOYROW_MMI))
#if !defined(HAS_RAWTOYROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
#undef HAS_RAWTOYROW
// Enabled if 1 pass is available
#if (defined(HAS_RAWTOYJROW_NEON) || defined(HAS_RAWTOYJROW_MSA) || \
defined(HAS_RAWTOYJROW_MMI))
#define HAS_RAWTOYJROW
#endif
// TODO(fbarchard): Use Matrix version to implement I420 and J420.
// Convert RAW to J420.
LIBYUV_API
int RAWToJ420(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int src_stride_raw,
uint8_t* dst_y,
int dst_stride_y,
uint8_t* dst_u,
int dst_stride_u,
uint8_t* dst_v,
int dst_stride_v,
int width,
int height) {
int y;
#if (defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI)
#if defined(HAS_RAWTOYJROW)
void (*RAWToUVJRow)(const uint8_t* src_raw, int src_stride_raw,
uint8_t* dst_u, uint8_t* dst_v, int width) =
uint8_t* dst_u, uint8_t* dst_v, int width) =
RAWToUVJRow_C;
void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) =
RAWToYJRow_C;
@ -2267,7 +2297,7 @@ int RAWToJ420(const uint8_t* src_raw,
void (*RAWToARGBRow)(const uint8_t* src_rgb, uint8_t* dst_argb, int width) =
RAWToARGBRow_C;
void (*ARGBToUVJRow)(const uint8_t* src_argb0, int src_stride_argb,
uint8_t* dst_u, uint8_t* dst_v, int width) =
uint8_t* dst_u, uint8_t* dst_v, int width) =
ARGBToUVJRow_C;
void (*ARGBToYJRow)(const uint8_t* src_argb, uint8_t* dst_y, int width) =
ARGBToYJRow_C;
@ -2282,6 +2312,8 @@ int RAWToJ420(const uint8_t* src_raw,
src_stride_raw = -src_stride_raw;
}
#if defined(HAS_RAWTOYJROW)
// Neon version does direct RAW to YUV.
#if defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
@ -2294,8 +2326,7 @@ int RAWToJ420(const uint8_t* src_raw,
}
}
}
// MMI and MSA version does direct RAW to YUV.
#elif (defined(HAS_RAWTOYJROW_MMI) || defined(HAS_RAWTOYJROW_MSA))
#endif
#if defined(HAS_RAWTOYJROW_MMI) && defined(HAS_RAWTOUVJROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
RAWToUVJRow = RAWToUVJRow_Any_MMI;
@ -2318,27 +2349,10 @@ int RAWToJ420(const uint8_t* src_raw,
}
}
#endif
#else
#if defined(HAS_RAWTOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
RAWToARGBRow = RAWToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
RAWToARGBRow = RAWToARGBRow_NEON;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_NEON) && defined(HAS_ARGBTOUVJROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
ARGBToYJRow = ARGBToYJRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
ARGBToYJRow = ARGBToYJRow_NEON;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_NEON;
}
}
}
#endif
// Other platforms do intermediate conversion from RAW to ARGB.
#else // HAS_RAWTOYJROW
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
@ -2347,39 +2361,49 @@ int RAWToJ420(const uint8_t* src_raw,
}
}
#endif
#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
#if defined(HAS_ARGBTOYJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
ARGBToYJRow = ARGBToYJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
#if defined(HAS_ARGBTOYJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
ARGBToYJRow = ARGBToYJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
ARGBToYJRow = ARGBToYJRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVJRow = ARGBToUVJRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVJROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVJRow = ARGBToUVJRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVJRow = ARGBToUVJRow_AVX2;
}
}
#endif
#endif // HAS_RAWTOYJROW
{
#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
#if !defined(HAS_RAWTOYJROW)
// Allocate 2 rows of ARGB.
const int kRowSize = (width * 4 + 31) & ~31;
align_buffer_64(row, kRowSize * 2);
#endif
for (y = 0; y < height - 1; y += 2) {
#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, src_stride_raw, dst_u, dst_v, width);
RAWToYJRow(src_raw, dst_y, width);
RAWToYJRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
@ -2396,8 +2420,7 @@ int RAWToJ420(const uint8_t* src_raw,
dst_v += dst_stride_v;
}
if (height & 1) {
#if ((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
#if defined(HAS_RAWTOYJROW)
RAWToUVJRow(src_raw, 0, dst_u, dst_v, width);
RAWToYJRow(src_raw, dst_y, width);
#else
@ -2406,13 +2429,13 @@ int RAWToJ420(const uint8_t* src_raw,
ARGBToYJRow(row, dst_y, width);
#endif
}
#if !((defined(HAS_RAWTOYJROW_NEON) && defined(HAS_RAWTOUVJROW_NEON)) || \
defined(HAS_RAWTOYJROW_MSA) || defined(HAS_RAWTOYJROW_MMI))
#if !defined(HAS_RAWTOYJROW)
free_aligned_buffer_64(row);
#endif
}
return 0;
}
#undef HAS_RAWTOYJROW
// Convert RGB565 to I420.
LIBYUV_API
@ -2507,26 +2530,38 @@ int RGB565ToI420(const uint8_t* src_rgb565,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#endif
{
#if !(defined(HAS_RGB565TOYROW_NEON) || defined(HAS_RGB565TOYROW_MSA) || \
@ -2666,26 +2701,38 @@ int ARGB1555ToI420(const uint8_t* src_argb1555,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#endif
{
#if !(defined(HAS_ARGB1555TOYROW_NEON) || defined(HAS_ARGB1555TOYROW_MSA) || \
@ -2822,26 +2869,38 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToUVRow = ARGBToUVRow_Any_MMI;

View File

@ -170,26 +170,38 @@ int ARGBToI422(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -271,26 +283,6 @@ int ARGBToNV12(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -307,6 +299,38 @@ int ARGBToNV12(const uint8_t* src_argb,
}
}
#endif
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_MMI) && defined(HAS_ARGBTOUVROW_MMI)
if (TestCpuFlag(kCpuHasMMI)) {
ARGBToYRow = ARGBToYRow_Any_MMI;
@ -423,26 +447,38 @@ int ARGBToNV21(const uint8_t* src_argb,
src_argb = src_argb + (height - 1) * src_stride_argb;
src_stride_argb = -src_stride_argb;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -574,26 +610,38 @@ int ABGRToNV12(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@ -726,26 +774,38 @@ int ABGRToNV21(const uint8_t* src_abgr,
src_abgr = src_abgr + (height - 1) * src_stride_abgr;
src_stride_abgr = -src_stride_abgr;
}
#if defined(HAS_ABGRTOYROW_SSSE3) && defined(HAS_ABGRTOUVROW_SSSE3)
#if defined(HAS_ABGRTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
ABGRToYRow = ABGRToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
ABGRToYRow = ABGRToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2) && defined(HAS_ABGRTOUVROW_AVX2)
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ABGRToUVRow = ABGRToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ABGRTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
ABGRToYRow = ABGRToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
ABGRToYRow = ABGRToYRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ABGRToUVRow = ABGRToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ABGRToUVRow = ABGRToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ABGRTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ABGRToYRow = ABGRToYRow_Any_NEON;
@ -883,26 +943,38 @@ int ARGBToYUY2(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_yuy2 = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;
@ -1036,26 +1108,38 @@ int ARGBToUYVY(const uint8_t* src_argb,
height = 1;
src_stride_argb = dst_stride_uyvy = 0;
}
#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
#if defined(HAS_ARGBTOYROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
ARGBToYRow = ARGBToYRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
ARGBToYRow = ARGBToYRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToUVRow = ARGBToUVRow_SSSE3;
}
}
#endif
#if defined(HAS_ARGBTOYROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
ARGBToYRow = ARGBToYRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
ARGBToYRow = ARGBToYRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOUVROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
ARGBToUVRow = ARGBToUVRow_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
ARGBToUVRow = ARGBToUVRow_AVX2;
}
}
#endif
#if defined(HAS_ARGBTOYROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
ARGBToYRow = ARGBToYRow_Any_NEON;

View File

@ -1977,6 +1977,9 @@ ANY12S(RGBAToUVRow_Any_MMI, RGBAToUVRow_MMI, 0, 4, 15)
#ifdef HAS_RGB24TOUVROW_NEON
ANY12S(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVJROW_NEON
ANY12S(RGB24ToUVJRow_Any_NEON, RGB24ToUVJRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RGB24TOUVROW_MSA
ANY12S(RGB24ToUVRow_Any_MSA, RGB24ToUVRow_MSA, 0, 3, 15)
#endif
@ -1986,6 +1989,9 @@ ANY12S(RGB24ToUVRow_Any_MMI, RGB24ToUVRow_MMI, 0, 3, 15)
#ifdef HAS_RAWTOUVROW_NEON
ANY12S(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVJROW_NEON
ANY12S(RAWToUVJRow_Any_NEON, RAWToUVJRow_NEON, 0, 3, 15)
#endif
#ifdef HAS_RAWTOUVROW_MSA
ANY12S(RAWToUVRow_Any_MSA, RAWToUVRow_MSA, 0, 3, 15)
#endif

View File

@ -1830,6 +1830,98 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
);
}
// TODO(fbarchard): Subsample match C code.
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_rgb24
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
"vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
"vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q0, q1, q2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_stride_rgb24), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
// TODO(fbarchard): Subsample match C code.
void RAWToUVJRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
asm volatile (
"add %1, %0, %1 \n" // src_stride + src_raw
"vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
"vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
"vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
"vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
"vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
"vmov.u16 q15, #0x8080 \n" // 128.5
"1: \n"
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
"vrshr.u16 q0, q0, #1 \n" // 2x average
"vrshr.u16 q1, q1, #1 \n"
"vrshr.u16 q2, q2, #1 \n"
"subs %4, %4, #16 \n" // 16 processed per loop.
RGBTOUV(q2, q1, q0)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_stride_raw), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
);
}
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,

View File

@ -2001,7 +2001,7 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2017,6 +2017,96 @@ void ARGBToUVJRow_NEON(const uint8_t* src_argb,
);
}
void RGB24ToUVJRow_NEON(const uint8_t* src_rgb24,
int src_stride_rgb24,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
"movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
"movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
"movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
"movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void RAWToUVJRow_NEON(const uint8_t* src_raw,
int src_stride_raw,
uint8_t* dst_u,
uint8_t* dst_v,
int width) {
const uint8_t* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
"movi v21.8h, #42, lsl #0 \n" // UG coeff (-0.33126) / 2
"movi v22.8h, #21, lsl #0 \n" // UR coeff (-0.16874) / 2
"movi v23.8h, #10, lsl #0 \n" // VB coeff (-0.08131) / 2
"movi v24.8h, #53, lsl #0 \n" // VG coeff (-0.41869) / 2
"movi v25.16b, #0x80 \n" // 128.5 (0x8080 in 16-bit)
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 pixels.
"uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
"uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
"ld3 {v4.16b,v5.16b,v6.16b}, [%1], #48 \n" // load next 16
"uadalp v0.8h, v4.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%1, 448] \n"
"uadalp v1.8h, v5.16b \n" // G 16 bytes -> 8 shorts.
"uadalp v2.8h, v6.16b \n" // R 16 bytes -> 8 shorts.
"urshr v0.8h, v0.8h, #1 \n" // 2x average
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
"+r"(width) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
void BGRAToUVRow_NEON(const uint8_t* src_bgra,
int src_stride_bgra,
uint8_t* dst_u,
@ -2041,7 +2131,7 @@ void BGRAToUVRow_NEON(const uint8_t* src_bgra,
"urshr v1.8h, v3.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2081,7 +2171,7 @@ void ABGRToUVRow_NEON(const uint8_t* src_abgr,
"urshr v2.8h, v2.8h, #1 \n"
"urshr v1.8h, v1.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v2.8h, v1.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2121,7 +2211,7 @@ void RGBAToUVRow_NEON(const uint8_t* src_rgba,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2161,7 +2251,7 @@ void RGB24ToUVRow_NEON(const uint8_t* src_rgb24,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v2.8h, v2.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v0.8h, v1.8h, v2.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.
@ -2186,7 +2276,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 8 RAW pixels.
"ld3 {v0.16b,v1.16b,v2.16b}, [%0], #48 \n" // load 16 RAW pixels.
"uaddlp v2.8h, v2.16b \n" // B 16 bytes -> 8 shorts.
"prfm pldl1keep, [%0, 448] \n"
"uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
@ -2201,7 +2291,7 @@ void RAWToUVRow_NEON(const uint8_t* src_raw,
"urshr v1.8h, v1.8h, #1 \n"
"urshr v0.8h, v0.8h, #1 \n"
"subs %w4, %w4, #16 \n" // 32 processed per loop.
"subs %w4, %w4, #16 \n" // 16 processed per loop.
RGBTOUV(v2.8h, v1.8h, v0.8h)
"st1 {v0.8b}, [%2], #8 \n" // store 8 pixels U.
"st1 {v1.8b}, [%3], #8 \n" // store 8 pixels V.

View File

@ -4096,6 +4096,68 @@ TEST_F(LibYUVConvertTest, TestH420ToAR30) {
free_aligned_buffer_page_end(ar30_pixels);
}
// Test I400 with jpeg matrix is same as J400
TEST_F(LibYUVConvertTest, TestI400) {
const int kSize = 256;
align_buffer_page_end(orig_i400, kSize);
align_buffer_page_end(argb_pixels_i400, kSize * 4);
align_buffer_page_end(argb_pixels_j400, kSize * 4);
align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
// Test grey scale
for (int i = 0; i < kSize; ++i) {
orig_i400[i] = i;
}
J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
kSize, 1);
EXPECT_EQ(0, argb_pixels_i400[0]);
EXPECT_EQ(0, argb_pixels_j400[0]);
EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
EXPECT_EQ(0, argb_pixels_h709_i400[0]);
EXPECT_EQ(0, argb_pixels_2020_i400[0]);
EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
for (int i = 0; i < kSize * 4; ++i) {
if ((i & 3) == 3) {
EXPECT_EQ(255, argb_pixels_j400[i]);
} else {
EXPECT_EQ(i / 4, argb_pixels_j400[i]);
}
EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
}
free_aligned_buffer_page_end(orig_i400);
free_aligned_buffer_page_end(argb_pixels_i400);
free_aligned_buffer_page_end(argb_pixels_j400);
free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
free_aligned_buffer_page_end(argb_pixels_h709_i400);
free_aligned_buffer_page_end(argb_pixels_2020_i400);
}
// Test RGB24 to ARGB and back to RGB24
TEST_F(LibYUVConvertTest, TestARGBToRGB24) {
const int kSize = 256;
@ -4162,66 +4224,32 @@ TEST_F(LibYUVConvertTest, TestRGB24ToJ420) {
}
#endif
// Test I400 with jpeg matrix is same as J400
TEST_F(LibYUVConvertTest, TestI400) {
// Test RGB24 to I420 is exact
#if defined(LIBYUV_BIT_EXACT)
TEST_F(LibYUVConvertTest, TestRGB24ToI420) {
const int kSize = 256;
align_buffer_page_end(orig_i400, kSize);
align_buffer_page_end(argb_pixels_i400, kSize * 4);
align_buffer_page_end(argb_pixels_j400, kSize * 4);
align_buffer_page_end(argb_pixels_jpeg_i400, kSize * 4);
align_buffer_page_end(argb_pixels_h709_i400, kSize * 4);
align_buffer_page_end(argb_pixels_2020_i400, kSize * 4);
align_buffer_page_end(orig_rgb24, kSize * 3 * 2); // 2 rows of RGB24
align_buffer_page_end(dest_i420, kSize * 3 / 2 * 2);
int iterations256 = (benchmark_width_ * benchmark_height_ + (kSize * 2 - 1)) /
(kSize * 2) * benchmark_iterations_;
// Test grey scale
for (int i = 0; i < kSize; ++i) {
orig_i400[i] = i;
for (int i = 0; i < kSize * 3 * 2; ++i) {
orig_rgb24[i] = i;
}
J400ToARGB(orig_i400, 0, argb_pixels_j400, 0, kSize, 1);
I400ToARGB(orig_i400, 0, argb_pixels_i400, 0, kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_jpeg_i400, 0, &kYuvJPEGConstants,
kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_h709_i400, 0, &kYuvH709Constants,
kSize, 1);
I400ToARGBMatrix(orig_i400, 0, argb_pixels_2020_i400, 0, &kYuv2020Constants,
kSize, 1);
EXPECT_EQ(0, argb_pixels_i400[0]);
EXPECT_EQ(0, argb_pixels_j400[0]);
EXPECT_EQ(0, argb_pixels_jpeg_i400[0]);
EXPECT_EQ(0, argb_pixels_h709_i400[0]);
EXPECT_EQ(0, argb_pixels_2020_i400[0]);
EXPECT_EQ(0, argb_pixels_i400[16 * 4]);
EXPECT_EQ(16, argb_pixels_j400[16 * 4]);
EXPECT_EQ(16, argb_pixels_jpeg_i400[16 * 4]);
EXPECT_EQ(0, argb_pixels_h709_i400[16 * 4]);
EXPECT_EQ(0, argb_pixels_2020_i400[16 * 4]);
EXPECT_EQ(130, argb_pixels_i400[128 * 4]);
EXPECT_EQ(128, argb_pixels_j400[128 * 4]);
EXPECT_EQ(128, argb_pixels_jpeg_i400[128 * 4]);
EXPECT_EQ(130, argb_pixels_h709_i400[128 * 4]);
EXPECT_EQ(130, argb_pixels_2020_i400[128 * 4]);
EXPECT_EQ(255, argb_pixels_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_j400[255 * 4]);
EXPECT_EQ(255, argb_pixels_jpeg_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_h709_i400[255 * 4]);
EXPECT_EQ(255, argb_pixels_2020_i400[255 * 4]);
for (int i = 0; i < kSize * 4; ++i) {
if ((i & 3) == 3) {
EXPECT_EQ(255, argb_pixels_j400[i]);
} else {
EXPECT_EQ(i / 4, argb_pixels_j400[i]);
}
EXPECT_EQ(argb_pixels_jpeg_i400[i], argb_pixels_j400[i]);
for (int i = 0; i < iterations256; ++i) {
RGB24ToI420(orig_rgb24, kSize * 3, dest_i420, kSize, // Y plane
dest_i420 + kSize * 2, kSize / 2, // U plane
dest_i420 + kSize * 5 / 2, kSize / 2, // V plane
kSize, 2);
}
free_aligned_buffer_page_end(orig_i400);
free_aligned_buffer_page_end(argb_pixels_i400);
free_aligned_buffer_page_end(argb_pixels_j400);
free_aligned_buffer_page_end(argb_pixels_jpeg_i400);
free_aligned_buffer_page_end(argb_pixels_h709_i400);
free_aligned_buffer_page_end(argb_pixels_2020_i400);
uint32_t checksum = HashDjb2(dest_i420, kSize * 3 / 2 * 2, 5381);
EXPECT_EQ(1526656597u, checksum);
free_aligned_buffer_page_end(orig_rgb24);
free_aligned_buffer_page_end(dest_i420);
}
#endif
} // namespace libyuv