I210ToARGB conversion from 10 bit YUV to RGB

SSSE3-optimized 10-bit YUV to ARGB conversion in a single step.

Bug: libyuv:751
Test:  I010ToARGB
Change-Id: I234b2850e35992113ee6bd638732bafc7010a60d
Reviewed-on: https://chromium-review.googlesource.com/848238
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
This commit is contained in:
Frank Barchard 2018-01-04 15:20:19 -08:00 committed by Commit Bot
parent ac088b4be9
commit a64658593e
8 changed files with 201 additions and 97 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: http://code.google.com/p/libyuv/ URL: http://code.google.com/p/libyuv/
Version: 1686 Version: 1687
License: BSD License: BSD
License File: LICENSE License File: LICENSE

View File

@ -63,6 +63,32 @@ int I420ToABGR(const uint8* src_y,
int width, int width,
int height); int height);
// Convert I010 to ARGB.
// Source planes are passed as 16 bit samples (uint16); the destination is
// 8 bit per channel (uint8).
// NOTE(review): the definition is not visible here - presumably the source
// strides are counted in uint16 elements and the return value is 0 on
// success / -1 on invalid arguments, like H010ToARGB; confirm.
LIBYUV_API
int I010ToARGB(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert H010 to ARGB.
// Source planes are passed as 16 bit samples (uint16); the destination is
// 8 bit per channel (uint8).
// The implementation forwards to H010ToARGBMatrix with kYuvH709Constants.
// NOTE(review): that helper appears to return -1 when any pointer is NULL,
// width <= 0 or height == 0, and 0 otherwise, and to advance the source
// pointers by the strides in uint16 elements - confirm against the
// definition.
LIBYUV_API
int H010ToARGB(const uint16* src_y,
int src_stride_y,
const uint16* src_u,
int src_stride_u,
const uint16* src_v,
int src_stride_v,
uint8* dst_argb,
int dst_stride_argb,
int width,
int height);
// Convert I422 to ARGB. // Convert I422 to ARGB.
LIBYUV_API LIBYUV_API
int I422ToARGB(const uint8* src_y, int I422ToARGB(const uint8* src_y,

View File

@ -265,6 +265,8 @@ extern "C" {
#define HAS_ARGBTOAR30ROW_SSSE3 #define HAS_ARGBTOAR30ROW_SSSE3
#define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT16TO8ROW_SSSE3
#define HAS_CONVERT8TO16ROW_SSE2 #define HAS_CONVERT8TO16ROW_SSE2
// The I210 row function is used by H010. The '2' means 4:2:2 chroma; an 'I' prefix denotes BT.601 and an 'H' prefix denotes BT.709.
#define HAS_I210TOARGBROW_SSSE3
#define HAS_MERGERGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3
#define HAS_SPLITRGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3
#endif #endif
@ -1735,9 +1737,9 @@ void I422ToARGBRow_C(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I422ToARGBRow_C(const uint8* src_y, void I210ToARGBRow_C(const uint16* src_y,
const uint8* src_u, const uint16* src_u,
const uint8* src_v, const uint16* src_v,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
@ -1807,12 +1809,6 @@ void I422ToARGBRow_AVX2(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I422ToARGBRow_AVX2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422ToRGBARow_AVX2(const uint8* src_y, void I422ToRGBARow_AVX2(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,
@ -1849,6 +1845,13 @@ void I422ToARGBRow_SSSE3(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// SSSE3 row function: converts one row of 16 bit Y, U and V samples to
// 8 bit ARGB using the supplied yuvconstants. The implementation consumes
// 8 pixels per loop iteration.
void I210ToARGBRow_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_SSSE3(const uint8* y_buf, void I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,
@ -1863,12 +1866,6 @@ void I422AlphaToARGBRow_AVX2(const uint8* y_buf,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
void I422ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void NV12ToARGBRow_SSSE3(const uint8* src_y, void NV12ToARGBRow_SSSE3(const uint8* src_y,
const uint8* src_uv, const uint8* src_uv,
uint8* dst_argb, uint8* dst_argb,
@ -1999,6 +1996,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y,
uint8* dst_argb, uint8* dst_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int width); int width);
// Any-width wrapper around I210ToARGBRow_SSSE3: the multiple-of-8 part of
// the row is converted directly and the remaining pixels go through a
// temporary buffer.
void I210ToARGBRow_Any_SSSE3(const uint16* src_y,
const uint16* src_u,
const uint16* src_v,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width);
void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf, void I422AlphaToARGBRow_Any_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,
const uint8* v_buf, const uint8* v_buf,

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1686 #define LIBYUV_VERSION 1687
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -47,7 +47,7 @@ int ARGBCopy(const uint8* src_argb,
return 0; return 0;
} }
// Convert I422 to ARGB with matrix // Convert I420 to ARGB with matrix
static int I420ToARGBMatrix(const uint8* src_y, static int I420ToARGBMatrix(const uint8* src_y,
int src_stride_y, int src_stride_y,
const uint8* src_u, const uint8* src_u,
@ -573,18 +573,13 @@ static int H010ToARGBMatrix(const uint16* src_y,
uint8* dst_argb, uint8* dst_argb,
int dst_stride_argb, int dst_stride_argb,
const struct YuvConstants* yuvconstants, const struct YuvConstants* yuvconstants,
int scale, // 16384 for 10 bits
int width, int width,
int height) { int height) {
int y; int y;
int halfwidth = (width + 1) >> 1; void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf,
void (*Convert16To8Row)(const uint16* src_y, uint8* dst_y, int scale, const uint16* v_buf, uint8* rgb_buf,
int width) = Convert16To8Row_C;
void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
const uint8* v_buf, uint8* rgb_buf,
const struct YuvConstants* yuvconstants, int width) = const struct YuvConstants* yuvconstants, int width) =
I422ToARGBRow_C; I210ToARGBRow_C;
if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) { if (!src_y || !src_u || !src_v || !dst_argb || width <= 0 || height == 0) {
return -1; return -1;
} }
@ -594,85 +589,23 @@ static int H010ToARGBMatrix(const uint16* src_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb; dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb; dst_stride_argb = -dst_stride_argb;
} }
#if defined(HAS_I210TOARGBROW_SSSE3)
#if defined(HAS_CONVERT16TO8ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) { if (TestCpuFlag(kCpuHasSSSE3)) {
Convert16To8Row = Convert16To8Row_Any_SSSE3; I210ToARGBRow = I210ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
Convert16To8Row = Convert16To8Row_SSSE3;
}
}
#endif
#if defined(HAS_CONVERT16TO8ROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
Convert16To8Row = Convert16To8Row_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
Convert16To8Row = Convert16To8Row_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3)) {
I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
if (IS_ALIGNED(width, 8)) { if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_SSSE3; I210ToARGBRow = I210ToARGBRow_SSSE3;
}
}
#endif
#if defined(HAS_I422TOARGBROW_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
I422ToARGBRow = I422ToARGBRow_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
I422ToARGBRow = I422ToARGBRow_AVX2;
}
}
#endif
#if defined(HAS_I422TOARGBROW_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
I422ToARGBRow = I422ToARGBRow_Any_NEON;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#endif
#if defined(HAS_I422TOARGBROW_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
I422ToARGBRow = I422ToARGBRow_Any_MSA;
if (IS_ALIGNED(width, 8)) {
I422ToARGBRow = I422ToARGBRow_MSA;
} }
} }
#endif #endif
{ for (y = 0; y < height; ++y) {
// Row buffers for 8 bit YUV. I210ToARGBRow(src_y, src_u, src_v, dst_argb, yuvconstants, width);
align_buffer_64(row_buf, width + halfwidth * 2); dst_argb += dst_stride_argb;
uint8* row_y = row_buf; src_y += src_stride_y;
uint8* row_u = row_buf + width; if (y & 1) {
uint8* row_v = row_buf + width + halfwidth;
for (y = 0; y < height - 1; y += 2) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, dst_argb, yuvconstants, width);
Convert16To8Row(src_y + src_stride_y, row_y, scale, width);
I422ToARGBRow(row_y, row_u, row_v, dst_argb + dst_stride_argb,
yuvconstants, width);
dst_argb += dst_stride_argb * 2;
src_y += src_stride_y * 2;
src_u += src_stride_u; src_u += src_stride_u;
src_v += src_stride_v; src_v += src_stride_v;
} }
if (height & 1) {
Convert16To8Row(src_y, row_y, scale, width);
Convert16To8Row(src_u, row_u, scale, halfwidth);
Convert16To8Row(src_v, row_v, scale, halfwidth);
I422ToARGBRow(row_y, row_u, row_v, dst_argb, yuvconstants, width);
}
free_aligned_buffer_64(row_buf);
} }
return 0; return 0;
} }
@ -691,7 +624,7 @@ int H010ToARGB(const uint16* src_y,
int height) { int height) {
return H010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v, return H010ToARGBMatrix(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_argb, dst_stride_argb, src_stride_v, dst_argb, dst_stride_argb,
&kYuvH709Constants, 16384, width, height); &kYuvH709Constants, width, height);
} }
// Convert I444 to ARGB with matrix // Convert I444 to ARGB with matrix

View File

@ -194,6 +194,32 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7)
#endif #endif
#undef ANY31C #undef ANY31C
// 64 byte per row for future AVX2
// Any 3 planes of 16 bit to 1 with yuvconstants
// TODO(fbarchard): consider
//
// ANY31CT generates a wrapper NAMEANY that lets a fixed-step SIMD row
// function (ANY_SIMD) handle an arbitrary width:
//   - n = width & ~MASK is the part that is a multiple of MASK + 1 and is
//     passed straight to ANY_SIMD (only when n > 0).
//   - r = width & MASK is the remainder. r Y samples and SS(r, UVSHIFT)
//     U and V samples are copied into a zeroed scratch buffer (16 elements
//     of type T per plane), ANY_SIMD is run on MASK + 1 pixels into a
//     64 byte output buffer, and SS(r, DUVSHIFT) * BPP bytes of the result
//     are copied to the destination.
//   - The scratch pass runs unconditionally, including when r == 0 (in which
//     case zero bytes are copied out).
// Parameters: UVSHIFT is the right shift applied to the pixel index to get
// the chroma index; DUVSHIFT is the corresponding shift for the destination;
// T is the sample type and SBPP its size in bytes; BPP is the destination
// bytes per pixel; MASK is the SIMD step minus one.
// NOTE(review): SS() is defined elsewhere in the file - presumably a
// round-up subsample of its first argument by the given shift; confirm.
#define ANY31CT(NAMEANY, ANY_SIMD, UVSHIFT, DUVSHIFT, T, SBPP, BPP, MASK) \
void NAMEANY(const T* y_buf, const T* u_buf, const T* v_buf, uint8* dst_ptr, \
const struct YuvConstants* yuvconstants, int width) { \
SIMD_ALIGNED(T temp[16 * 3]); \
SIMD_ALIGNED(uint8 out[64]); \
memset(temp, 0, 16 * 3 * SBPP); /* for YUY2 and msan */ \
int r = width & MASK; \
int n = width & ~MASK; \
if (n > 0) { \
ANY_SIMD(y_buf, u_buf, v_buf, dst_ptr, yuvconstants, n); \
} \
memcpy(temp, y_buf + n, r * SBPP); \
memcpy(temp + 16, u_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
memcpy(temp + 32, v_buf + (n >> UVSHIFT), SS(r, UVSHIFT) * SBPP); \
ANY_SIMD(temp, temp + 16, temp + 32, out, yuvconstants, MASK + 1); \
memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \
}
// Instantiate the any-width wrapper for the SSSE3 I210 row function:
// chroma index is pixel index >> 1, 2 byte uint16 samples, 4 byte ARGB
// output, 8 pixel SIMD step (MASK 7).
#ifdef HAS_I210TOARGBROW_SSSE3
ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7)
#endif
#undef ANY31CT
// Any 2 planes to 1. // Any 2 planes to 1.
#define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \ #define ANY21(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, SBPP2, BPP, MASK) \
void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \ void NAMEANY(const uint8* y_buf, const uint8* uv_buf, uint8* dst_ptr, \

View File

@ -1295,6 +1295,51 @@ static __inline void YuvPixel(uint8 y,
*r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
} }
// C reference code that mimics the YUV 10 bit assembly.
// Converts one Y, U, V sample triple to clamped 8 bit B, G, R.
//   y, u, v      - input samples, expected to be in 10 bit range.
//   b, g, r      - receive the converted 8 bit channel values.
//   yuvconstants - coefficients and biases; the member layout that is read
//                  depends on the target architecture (see the #if below).
static __inline void YuvPixel10(uint16 y,
                                uint16 u,
                                uint16 v,
                                uint8* b,
                                uint8* g,
                                uint8* r,
                                const struct YuvConstants* yuvconstants) {
#if defined(__aarch64__)
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = -yuvconstants->kUVToRB[1];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[0] / 0x0101;
#elif defined(__arm__)
  int ub = -yuvconstants->kUVToRB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[4];
  int vr = -yuvconstants->kUVToRB[4];
  int bb = yuvconstants->kUVBiasBGR[0];
  int bg = yuvconstants->kUVBiasBGR[1];
  int br = yuvconstants->kUVBiasBGR[2];
  int yg = yuvconstants->kYToRgb[0] / 0x0101;
#else
  int ub = yuvconstants->kUVToB[0];
  int ug = yuvconstants->kUVToG[0];
  int vg = yuvconstants->kUVToG[1];
  int vr = yuvconstants->kUVToR[1];
  int bb = yuvconstants->kUVBiasB[0];
  int bg = yuvconstants->kUVBiasG[0];
  int br = yuvconstants->kUVBiasR[0];
  int yg = yuvconstants->kYToRgb[0];
#endif
  // Scale Y from 10 to 16 bits (<< 6) and apply the Y gain, keeping the
  // high 16 bits of the product. The multiply is done in unsigned
  // arithmetic: a sample outside the 10 bit range would otherwise overflow
  // a signed int, which is undefined behavior. In-range results are
  // unchanged.
  uint32 y1 = (((uint32)y << 6) * (uint32)yg) >> 16;
  // Reduce chroma from 10 to 8 bits so the 8 bit coefficients apply.
  u = clamp255(u >> 2);
  v = clamp255(v >> 2);
  // Results carry 6 fractional bits; drop them and clamp to 0..255.
  *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6);
  *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6);
  *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6);
}
// Y contribution to R,G,B. Scale and bias. // Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
@ -1388,6 +1433,33 @@ void I422ToARGBRow_C(const uint8* src_y,
} }
} }
// 10 bit YUV to ARGB
// Portable row converter: reads 16 bit Y, U and V samples and writes 4 bytes
// per pixel (B, G, R from YuvPixel10, then alpha = 255). Each U/V sample is
// shared by two horizontally adjacent pixels.
void I210ToARGBRow_C(const uint16* src_y,
                     const uint16* src_u,
                     const uint16* src_v,
                     uint8* rgb_buf,
                     const struct YuvConstants* yuvconstants,
                     int width) {
  int remaining = width;

  // Convert whole pixel pairs while at least two pixels are left.
  while (remaining > 1) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
    YuvPixel10(src_y[1], src_u[0], src_v[0], &rgb_buf[4], &rgb_buf[5],
               &rgb_buf[6], yuvconstants);
    rgb_buf[7] = 255;

    src_y += 2;
    ++src_u;
    ++src_v;
    rgb_buf += 8;  // Two ARGB pixels.
    remaining -= 2;
  }

  // Odd width: one trailing pixel with its own chroma sample.
  if (width & 1) {
    YuvPixel10(src_y[0], src_u[0], src_v[0], &rgb_buf[0], &rgb_buf[1],
               &rgb_buf[2], yuvconstants);
    rgb_buf[3] = 255;
  }
}
void I422AlphaToARGBRow_C(const uint8* src_y, void I422AlphaToARGBRow_C(const uint8* src_y,
const uint8* src_u, const uint8* src_u,
const uint8* src_v, const uint8* src_v,

View File

@ -1623,6 +1623,20 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
"punpcklbw %%xmm4,%%xmm4 \n" \ "punpcklbw %%xmm4,%%xmm4 \n" \
"lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422 10 bit, upsample to 8 UV
// TODO(fbarchard): Consider shufb to replace pack/unpack
// Step by step:
//   movq       - load 4 x 16 bit U samples into xmm0.
//   MEMOPREG   - load 4 x 16 bit V samples into xmm1. NOTE(review): the
//                macro is defined elsewhere; presumably it addresses
//                [u_buf + v_buf], with v_buf holding the V-minus-U offset
//                set up by the caller - confirm.
//   lea        - advance u_buf by 8 bytes (4 samples).
//   punpcklwd  - interleave U and V words: U0 V0 U1 V1 U2 V2 U3 V3.
//   psraw 2    - shift each word right by 2 (10 bit -> 8 bit).
//   packuswb   - pack the words to unsigned bytes with saturation.
//   punpcklwd  - duplicate each 16 bit UV byte pair, giving 8 UV pairs.
//   movdqu     - load 8 x 16 bit Y samples into xmm4.
//   psllw 6    - shift each Y word left by 6 (10 bit -> 16 bit scale).
//   lea        - advance y_buf by 16 bytes (8 samples).
#define READYUV422_10 \
"movq " MEMACCESS([u_buf]) ",%%xmm0 \n" \
MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
"lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \
"punpcklwd %%xmm1,%%xmm0 \n" \
"psraw $0x2,%%xmm0 \n" \
"packuswb %%xmm0,%%xmm0 \n" \
"punpcklwd %%xmm0,%%xmm0 \n" \
"movdqu " MEMACCESS([y_buf]) ",%%xmm4 \n" \
"psllw $0x6,%%xmm4 \n" \
"lea " MEMLEA(0x10, [y_buf]) ",%[y_buf] \n"
// Read 4 UV from 422, upsample to 8 UV. With 8 Alpha. // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
#define READYUVA422 \ #define READYUVA422 \
"movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \ "movd " MEMACCESS([u_buf]) ",%%xmm0 \n" \
@ -1862,6 +1876,36 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
); );
} }
// 10 bit YUV to ARGB
// SSSE3 row converter: reads 16 bit Y, U and V samples and writes 8 bit
// ARGB, 8 pixels per loop iteration. The loop body runs before the width
// test, so at least 8 pixels are always processed; the loop ends once width
// has been reduced to zero or below.
// NOTE(review): YUVTORGB_SETUP, YUVTORGB and STOREARGB are defined elsewhere
// in the file; their register usage is assumed to match the clobber list
// below - confirm.
void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf,
const uint16* u_buf,
const uint16* v_buf,
uint8* dst_argb,
const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
YUVTORGB_SETUP(yuvconstants)
// Turn v_buf into an offset from u_buf so one advancing pointer serves both.
"sub %[u_buf],%[v_buf] \n"
// xmm5 = all ones (presumably the 0xff alpha consumed by STOREARGB).
"pcmpeqb %%xmm5,%%xmm5 \n"
LABELALIGN
"1: \n"
READYUV422_10
YUVTORGB(yuvconstants)
STOREARGB
// 8 pixels done; loop while any remain.
"sub $0x8,%[width] \n"
"jg 1b \n"
: [y_buf]"+r"(y_buf), // %[y_buf]
[u_buf]"+r"(u_buf), // %[u_buf]
[v_buf]"+r"(v_buf), // %[v_buf]
[dst_argb]"+r"(dst_argb), // %[dst_argb]
[width]"+rm"(width) // %[width]
: [yuvconstants]"r"(yuvconstants) // %[yuvconstants]
: "memory", "cc", NACL_R14 YUVTORGB_REGS
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
);
}
#ifdef HAS_I422ALPHATOARGBROW_SSSE3 #ifdef HAS_I422ALPHATOARGBROW_SSSE3
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
const uint8* u_buf, const uint8* u_buf,