From 8af6ea4100b8027697a7b4c81fb684544727c992 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Mon, 22 Jan 2018 17:04:03 -0800 Subject: [PATCH] I420ToAR30 in 1 step SSSE3 assembly Bug: libyuv:751 Test: LibYUVConvertTest.I420ToAR30_Opt Change-Id: Ie89c3eb2526354cf11175746bc8af72be83a1e00 Reviewed-on: https://chromium-review.googlesource.com/877541 Reviewed-by: Cheng Wang Commit-Queue: Frank Barchard --- include/libyuv/row.h | 19 ++++++++++ source/convert_from.cc | 72 ++++++++------------------------------ source/row_any.cc | 3 ++ source/row_common.cc | 79 ++++++++++++++++++++++++++++++++---------- source/row_gcc.cc | 34 ++++++++++++++++++ 5 files changed, 132 insertions(+), 75 deletions(-) diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 08b3465e6..0205bb597 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -258,6 +258,7 @@ extern "C" { // I210 is for H010. 2 = 422. I for 601 vs H for 709. #define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 +#define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #endif @@ -1683,6 +1684,12 @@ void I422ToARGBRow_C(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I422ToAR30Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToAR30Row_C(const uint16* src_y, const uint16* src_u, const uint16* src_v, @@ -1798,6 +1805,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, const struct YuvConstants* yuvconstants, int width); +void I422ToAR30Row_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToAR30Row_SSSE3(const uint16* src_y, const uint16* src_u, const uint16* src_v, @@ -1960,6 +1973,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void 
I422ToAR30Row_Any_SSSE3(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToAR30Row_Any_SSSE3(const uint16* src_y, const uint16* src_u, const uint16* src_v, diff --git a/source/convert_from.cc b/source/convert_from.cc index 9da607102..484c13c94 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -1149,12 +1149,10 @@ static int I420ToAR30Matrix(const uint8* src_y, int width, int height) { int y; - void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf, + void (*I422ToAR30Row)(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) = - I422ToARGBRow_C; - void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToAR30Row_C; + I422ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; @@ -1166,71 +1164,31 @@ static int I420ToAR30Matrix(const uint8* src_y, dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_ARGBTOAR30ROW_SSSE3) +#if defined(HAS_I422TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; - } - } -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; + I422ToAR30Row = I422ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - ARGBToAR30Row = ARGBToAR30Row_AVX2; + I422ToAR30Row = I422ToAR30Row_SSSE3; } } #endif -#if defined(HAS_I422TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - I422ToARGBRow = I422ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_SSSE3; - } - } -#endif -#if defined(HAS_I422TOARGBROW_AVX2) +#if defined(HAS_I422TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I422ToARGBRow = I422ToARGBRow_Any_AVX2; + I422ToAR30Row = I422ToAR30Row_Any_AVX2; if (IS_ALIGNED(width, 16)) { - 
I422ToARGBRow = I422ToARGBRow_AVX2; - } - } -#endif -#if defined(HAS_I422TOARGBROW_NEON) - if (TestCpuFlag(kCpuHasNEON)) { - I422ToARGBRow = I422ToARGBRow_Any_NEON; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_NEON; - } - } -#endif -#if defined(HAS_I422TOARGBROW_MSA) - if (TestCpuFlag(kCpuHasMSA)) { - I422ToARGBRow = I422ToARGBRow_Any_MSA; - if (IS_ALIGNED(width, 8)) { - I422ToARGBRow = I422ToARGBRow_MSA; + I422ToAR30Row = I422ToAR30Row_AVX2; } } #endif - { - // Row buffer for ARGB. - align_buffer_64(row_argb, width * 4); - - for (y = 0; y < height; ++y) { - I422ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width); - ARGBToAR30Row(row_argb, dst_ar30, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } + for (y = 0; y < height; ++y) { + I422ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; } - - free_aligned_buffer_64(row_argb); } return 0; } diff --git a/source/row_any.cc b/source/row_any.cc index d52a4a0ad..8b98feaf4 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -145,6 +145,9 @@ ANY31(BlendPlaneRow_Any_SSSE3, BlendPlaneRow_SSSE3, 0, 0, 1, 7) #ifdef HAS_I422TOARGBROW_SSSE3 ANY31C(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_SSSE3, 1, 0, 4, 7) #endif +#ifdef HAS_I422TOAR30ROW_SSSE3 +ANY31C(I422ToAR30Row_Any_SSSE3, I422ToAR30Row_SSSE3, 1, 0, 4, 7) +#endif #ifdef HAS_I444TOARGBROW_SSSE3 ANY31C(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_SSSE3, 0, 0, 4, 7) ANY31C(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_SSSE3, 1, 0, 4, 7) diff --git a/source/row_common.cc b/source/row_common.cc index 395f45905..c2093bae1 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1261,6 +1261,8 @@ const struct YuvConstants SIMD_ALIGNED(kYvuH709Constants) = { #undef YG // C reference code that mimics the YUV assembly. 
+// Reads 8 bit YUV and leaves result as 16 bit. + static __inline void YuvPixel(uint8 y, uint8 u, uint8 v, @@ -1303,14 +1305,14 @@ static __inline void YuvPixel(uint8 y, *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); } -// C reference code that mimics the YUV 10 bit assembly. -static __inline void YuvPixel10(uint16 y, - uint16 u, - uint16 v, - uint8* b, - uint8* g, - uint8* r, - const struct YuvConstants* yuvconstants) { +// Reads 8 bit YUV and leaves result as 16 bit. +static __inline void YuvPixel8_16(uint8 y, + uint8 u, + uint8 v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { #if defined(__aarch64__) int ub = -yuvconstants->kUVToRB[0]; int ug = yuvconstants->kUVToG[0]; @@ -1340,15 +1342,14 @@ static __inline void YuvPixel10(uint16 y, int yg = yuvconstants->kYToRgb[0]; #endif - uint32 y1 = (uint32)((y << 6) * yg) >> 16; - u = clamp255(u >> 2); - v = clamp255(v >> 2); - *b = Clamp((int32)(-(u * ub) + y1 + bb) >> 6); - *g = Clamp((int32)(-(u * ug + v * vg) + y1 + bg) >> 6); - *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); + uint32 y1 = (uint32)(y * 0x0101 * yg) >> 16; + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); } // C reference code that mimics the YUV 16 bit assembly. +// Reads 10 bit YUV and leaves result as 16 bit. static __inline void YuvPixel16(int16 y, int16 u, int16 v, @@ -1391,11 +1392,24 @@ static __inline void YuvPixel16(int16 y, *b = (int)(-(u * ub) + y1 + bb); *g = (int)(-(u * ug + v * vg) + y1 + bg); *r = (int)(-(v * vr) + y1 + br); +} - if ((int16)(*b & 0xffff) != *b) { - printf("%d vs %d bb %d y1 %d\n",(int16)*b, *b, bb, y1); - } - +// C reference code that mimics the YUV 10 bit assembly. +// Reads 10 bit YUV and clamps down to 8 bit RGB. 
+static __inline void YuvPixel10(uint16 y, + uint16 u, + uint16 v, + uint8* b, + uint8* g, + uint8* r, + const struct YuvConstants* yuvconstants) { + int b16; + int g16; + int r16; + YuvPixel16(y, u, v, &b16, &g16, &r16, yuvconstants); + *b = Clamp(b16 >> 6); + *g = Clamp(g16 >> 6); + *r = Clamp(r16 >> 6); } // Y contribution to R,G,B. Scale and bias. @@ -1560,6 +1574,35 @@ void I210ToAR30Row_C(const uint16* src_y, } } + +// 8 bit YUV to 10 bit AR30 +// Uses the same code as 10 bit YUV, but bit shifts the 8 bit values up to 10 bits. +void I422ToAR30Row_C(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel8_16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel8_16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + void I422AlphaToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 8ea735081..5946e4806 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1901,6 +1901,40 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, ); } +void OMITFP I422ToAR30Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" // AR30 constants + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV422 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + [u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + // 10 bit YUV to ARGB void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf, const uint16* u_buf,