mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
I420ToAR30 in 1 step SSSE3 assembly
Bug: libyuv:751
Test: LibYUVConvertTest.I420ToAR30_Opt
Change-Id: Ie89c3eb2526354cf11175746bc8af72be83a1e00
Reviewed-on: https://chromium-review.googlesource.com/877541
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
09db0c4ce2
commit
8af6ea4100
@ -258,6 +258,7 @@ extern "C" {
|
||||
// I210 is for H010. 2 = 422. I for 601 vs H for 709.
|
||||
#define HAS_I210TOAR30ROW_SSSE3
|
||||
#define HAS_I210TOARGBROW_SSSE3
|
||||
#define HAS_I422TOAR30ROW_SSSE3
|
||||
#define HAS_MERGERGBROW_SSSE3
|
||||
#define HAS_SPLITRGBROW_SSSE3
|
||||
#endif
|
||||
@ -1683,6 +1684,12 @@ void I422ToARGBRow_C(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToAR30Row_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_ar30,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I210ToAR30Row_C(const uint16* src_y,
|
||||
const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
@ -1798,6 +1805,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
|
||||
void I422ToAR30Row_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_ar30,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I210ToAR30Row_SSSE3(const uint16* src_y,
|
||||
const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
@ -1960,6 +1973,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
|
||||
uint8* dst_argb,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I422ToAR30Row_Any_SSSE3(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* dst_ar30,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width);
|
||||
void I210ToAR30Row_Any_SSSE3(const uint16* src_y,
|
||||
const uint16* src_u,
|
||||
const uint16* src_v,
|
||||
|
||||
@ -1149,12 +1149,10 @@ static int I420ToAR30Matrix(const uint8* src_y,
|
||||
int width,
|
||||
int height) {
|
||||
int y;
|
||||
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
|
||||
void (*I422ToAR30Row)(const uint8* y_buf, const uint8* u_buf,
|
||||
const uint8* v_buf, uint8* rgb_buf,
|
||||
const struct YuvConstants* yuvconstants, int width) =
|
||||
I422ToARGBRow_C;
|
||||
void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) =
|
||||
ARGBToAR30Row_C;
|
||||
I422ToAR30Row_C;
|
||||
|
||||
if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) {
|
||||
return -1;
|
||||
@ -1166,71 +1164,31 @@ static int I420ToAR30Matrix(const uint8* src_y,
|
||||
dst_stride_ar30 = -dst_stride_ar30;
|
||||
}
|
||||
|
||||
#if defined(HAS_ARGBTOAR30ROW_SSSE3)
|
||||
#if defined(HAS_I422TOAR30ROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 4)) {
|
||||
ARGBToAR30Row = ARGBToAR30Row_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_ARGBTOAR30ROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
ARGBToAR30Row = ARGBToAR30Row_Any_AVX2;
|
||||
I422ToAR30Row = I422ToAR30Row_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ARGBToAR30Row = ARGBToAR30Row_AVX2;
|
||||
I422ToAR30Row = I422ToAR30Row_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_SSSE3;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_AVX2)
|
||||
#if defined(HAS_I422TOAR30ROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
|
||||
I422ToAR30Row = I422ToAR30Row_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
I422ToARGBRow = I422ToARGBRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#if defined(HAS_I422TOARGBROW_MSA)
|
||||
if (TestCpuFlag(kCpuHasMSA)) {
|
||||
I422ToARGBRow = I422ToARGBRow_Any_MSA;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
I422ToARGBRow = I422ToARGBRow_MSA;
|
||||
I422ToAR30Row = I422ToAR30Row_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
{
|
||||
// Row buffer for ARGB.
|
||||
align_buffer_64(row_argb, width * 4);
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width);
|
||||
ARGBToAR30Row(row_argb, dst_ar30, width);
|
||||
dst_ar30 += dst_stride_ar30;
|
||||
src_y += src_stride_y;
|
||||
if (y & 1) {
|
||||
src_u += src_stride_u;
|
||||
src_v += src_stride_v;
|
||||
}
|
||||
for (y = 0; y < height; ++y) {
|
||||
I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width);
|
||||
dst_ar30 += dst_stride_ar30;
|
||||
src_y += src_stride_y;
|
||||
if (y & 1) {
|
||||
src_u += src_stride_u;
|
||||
src_v += src_stride_v;
|
||||
}
|
||||
|
||||
free_aligned_buffer_64(row_argb);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -145,6 +145,9 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7)
|
||||
#ifdef HAS_I422TOARGBROW_SSSE3
|
||||
ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I422TOAR30ROW_SSSE3
|
||||
ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7)
|
||||
#endif
|
||||
#ifdef HAS_I444TOARGBROW_SSSE3
|
||||
ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7)
|
||||
ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7)
|
||||
|
||||
@ -1261,6 +1261,8 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = {
|
||||
#undef YG
|
||||
|
||||
// C reference code that mimics the YUV assembly.
|
||||
// Reads 8 bit YUV and clamps down to 8 bit RGB.
|
||||
|
||||
static __inline void YuvPixel(uint8 y,
|
||||
uint8 u,
|
||||
uint8 v,
|
||||
@ -1303,14 +1305,14 @@ static __inline void YuvPixel(uint8 y,
|
||||
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
|
||||
}
|
||||
|
||||
// C reference code that mimics the YUV 10 bit assembly.
|
||||
static __inline void YuvPixel10(uint16 y,
|
||||
uint16 u,
|
||||
uint16 v,
|
||||
uint8* b,
|
||||
uint8* g,
|
||||
uint8* r,
|
||||
const struct YuvConstants* yuvconstants) {
|
||||
// Reads 8 bit YUV and leaves result as 16 bit.
|
||||
static __inline void YuvPixel8_16(uint8 y,
|
||||
uint8 u,
|
||||
uint8 v,
|
||||
int* b,
|
||||
int* g,
|
||||
int* r,
|
||||
const struct YuvConstants* yuvconstants) {
|
||||
#if defined(__aarch64__)
|
||||
int ub = -yuvconstants->kUVToRB[0];
|
||||
int ug = yuvconstants->kUVToG[0];
|
||||
@ -1340,15 +1342,14 @@ static __inline void YuvPixel10(uint16 y,
|
||||
int yg = yuvconstants->kYToRgb[0];
|
||||
#endif
|
||||
|
||||
uint32 y1 = (uint32)((y << 6) * yg) >> 16;
|
||||
u = clamp255(u >> 2);
|
||||
v = clamp255(v >> 2);
|
||||
*b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
|
||||
*g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
|
||||
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
|
||||
uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16;
|
||||
*b = (int)(-(u * ub) + y1 + bb);
|
||||
*g = (int)(-(u * ug + v * vg) + y1 + bg);
|
||||
*r = (int)(-(v * vr) + y1 + br);
|
||||
}
|
||||
|
||||
// C reference code that mimics the YUV 16 bit assembly.
|
||||
// Reads 10 bit YUV and leaves result as 16 bit.
|
||||
static __inline void YuvPixel16(int16 y,
|
||||
int16 u,
|
||||
int16 v,
|
||||
@ -1391,11 +1392,24 @@ static __inline void YuvPixel16(int16 y,
|
||||
*b = (int)(-(u * ub) + y1 + bb);
|
||||
*g = (int)(-(u * ug + v * vg) + y1 + bg);
|
||||
*r = (int)(-(v * vr) + y1 + br);
|
||||
}
|
||||
|
||||
if ((int16)(*b & 0xffff) != *b) {
|
||||
printf("%d vs %d bb %d y1 %d\n",(int16)*b, *b, bb, y1);
|
||||
}
|
||||
|
||||
// C reference code that mimics the YUV 10 bit assembly.
|
||||
// Reads 10 bit YUV and clamps down to 8 bit RGB.
|
||||
static __inline void YuvPixel10(uint16 y,
|
||||
uint16 u,
|
||||
uint16 v,
|
||||
uint8* b,
|
||||
uint8* g,
|
||||
uint8* r,
|
||||
const struct YuvConstants* yuvconstants) {
|
||||
int b16;
|
||||
int g16;
|
||||
int r16;
|
||||
YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants);
|
||||
*b = Clamp(b16 >> 6);
|
||||
*g = Clamp(g16 >> 6);
|
||||
*r = Clamp(r16 >> 6);
|
||||
}
|
||||
|
||||
// Y contribution to R,G,B. Scale and bias.
|
||||
@ -1560,6 +1574,35 @@ void I210ToAR30Row_C(const uint16* src_y,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 8 bit YUV to 10 bit AR30
|
||||
// Uses same code as 10 bit YUV bit shifts the 8 bit values up to 10 bits.
|
||||
void I422ToAR30Row_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
uint8* rgb_buf,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
int x;
|
||||
int b;
|
||||
int g;
|
||||
int r;
|
||||
for (x = 0; x < width - 1; x += 2) {
|
||||
YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
|
||||
StoreAR30(rgb_buf, b, g, r);
|
||||
YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
|
||||
StoreAR30(rgb_buf + 4, b, g, r);
|
||||
src_y += 2;
|
||||
src_u += 1;
|
||||
src_v += 1;
|
||||
rgb_buf += 8; // Advance 2 pixels.
|
||||
}
|
||||
if (width & 1) {
|
||||
YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants);
|
||||
StoreAR30(rgb_buf, b, g, r);
|
||||
}
|
||||
}
|
||||
|
||||
void I422AlphaToARGBRow_C(const uint8* src_y,
|
||||
const uint8* src_u,
|
||||
const uint8* src_v,
|
||||
|
||||
@ -1901,6 +1901,40 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
|
||||
);
|
||||
}
|
||||
|
||||
// 8 bit YUV (I422) to AR30 for one row, using SSSE3 inline assembly.
// The loop subtracts 8 from width per iteration and repeats while the result
// is still positive, so it works in groups of 8 pixels.  The dispatcher in
// this change selects this function only when width is a multiple of 8 and
// otherwise goes through the _Any_ wrapper.
// The READYUV422, YUVTORGB16 and STOREAR30 macros are not visible in this
// view; notes about them below are assumptions to confirm against their
// definitions.
void OMITFP I422ToAR30Row_SSSE3(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
uint8* dst_ar30,
|
||||
const struct YuvConstants* yuvconstants,
|
||||
int width) {
|
||||
asm volatile (
|
||||
YUVTORGB_SETUP(yuvconstants)
|
||||
"sub %[u_buf],%[v_buf] \n" // v_buf now holds (v_buf - u_buf); presumably READYUV422 addresses V relative to u_buf - confirm
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants: start from all ones
|
||||
"psrlw $14,%%xmm5 \n" // each 16 bit lane = 0x0003
|
||||
"psllw $4,%%xmm5 \n" // each 16 bit lane = 0x0030: 2 alpha bits
|
||||
"pxor %%xmm6,%%xmm6 \n" // 0 for min
|
||||
"pcmpeqb %%xmm7,%%xmm7 \n" // all ones
|
||||
"psrlw $6,%%xmm7 \n" // each 16 bit lane = 0x03ff: 1023 for max
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
READYUV422 // presumably loads the next group of Y, U, V samples - macro not shown
|
||||
YUVTORGB16(yuvconstants) // presumably converts to 16 bit B, G, R - macro not shown
|
||||
STOREAR30 // presumably clamps with xmm6/xmm7, adds alpha from xmm5 and stores to dst_ar30 - macro not shown
|
||||
"sub $0x8,%[width] \n" // 8 fewer pixels remaining
|
||||
"jg 1b \n" // loop while width is still > 0
|
||||
: [y_buf]"+r"(y_buf), // %[y_buf]
|
||||
[u_buf]"+r"(u_buf), // %[u_buf]
|
||||
[v_buf]"+r"(v_buf), // %[v_buf]
|
||||
[dst_ar30]"+r"(dst_ar30), // %[dst_ar30]
|
||||
[width]"+rm"(width) // %[width]
|
||||
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
|
||||
: "memory", "cc", YUVTORGB_REGS
|
||||
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||
);
|
||||
}
|
||||
|
||||
// 10 bit YUV to ARGB
|
||||
void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
|
||||
const uint16* u_buf,
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user