From 09db0c4ce2008008f73b247f1a5b64cfbb29b72e Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Fri, 19 Jan 2018 10:22:08 -0800 Subject: [PATCH] H010ToAR30 in 1 step with SSSE3 assembly Switch YUV conversion macro to output 16 bits per channel. STOREAR30 macro to output AR30. [ RUN ] LibYUVConvertTest.TestH420ToARGB uniques: B 220, G, 220, R 220 [ OK ] LibYUVConvertTest.TestH420ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToARGB uniques: B 256, G, 256, R 256 [ OK ] LibYUVConvertTest.TestH010ToARGB (0 ms) [ RUN ] LibYUVConvertTest.TestH010ToAR30 uniques: B 883, G, 883, R 883 [ OK ] LibYUVConvertTest.TestH010ToAR30 (0 ms) Bug: libyuv:751 Test: LibYUVConvertTest.H010ToAR30_Opt Change-Id: I902b718e2c8b68ede69625ccafebc6519d5af70d Reviewed-on: https://chromium-review.googlesource.com/869511 Reviewed-by: Frank Barchard Reviewed-by: Miguel Casas Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/convert_argb.h | 13 ++++ include/libyuv/row.h | 19 +++++ include/libyuv/version.h | 2 +- source/convert_argb.cc | 79 +++++++++------------ source/row_any.cc | 3 + source/row_common.cc | 112 ++++++++++++++++++++++++++++-- source/row_gcc.cc | 126 +++++++++++++++++++++++++--------- 8 files changed, 269 insertions(+), 87 deletions(-) diff --git a/README.chromium b/README.chromium index 77ab37e82..8d6e615a0 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1689 +Version: 1690 License: BSD License File: LICENSE diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h index b8b57cb12..973c615b9 100644 --- a/include/libyuv/convert_argb.h +++ b/include/libyuv/convert_argb.h @@ -420,6 +420,19 @@ int H010ToARGB(const uint16* src_y, int width, int height); +// Convert I010 to AR30. 
+LIBYUV_API +int I010ToAR30(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + uint8* dst_ar30, + int dst_stride_ar30, + int width, + int height); + // Convert H010 to AR30. LIBYUV_API int H010ToAR30(const uint16* src_y, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 28ecc6726..08b3465e6 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -256,6 +256,7 @@ extern "C" { #define HAS_CONVERT16TO8ROW_SSSE3 #define HAS_CONVERT8TO16ROW_SSE2 // I210 is for H010. 2 = 422. I for 601 vs H for 709. +#define HAS_I210TOAR30ROW_SSSE3 #define HAS_I210TOARGBROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 @@ -1682,6 +1683,12 @@ void I422ToARGBRow_C(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_C(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_C(const uint16* src_y, const uint16* src_u, const uint16* src_v, @@ -1791,6 +1798,12 @@ void I422ToARGBRow_SSSE3(const uint8* src_y, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_SSSE3(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_SSSE3(const uint16* src_y, const uint16* src_u, const uint16* src_v, @@ -1947,6 +1960,12 @@ void I422ToARGBRow_Any_SSSE3(const uint8* src_y, uint8* dst_argb, const struct YuvConstants* yuvconstants, int width); +void I210ToAR30Row_Any_SSSE3(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width); void I210ToARGBRow_Any_SSSE3(const uint16* src_y, const uint16* src_u, const uint16* src_v, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b191add97..dc0bf6f0b 100644 --- 
a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1689 +#define LIBYUV_VERSION 1690 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 292010831..20cac2de2 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -413,7 +413,7 @@ int H422ToABGR(const uint8* src_y, // Convert 10 bit YUV to ARGB with matrix // TODO(fbarchard): Consider passing scale multiplier to I210ToARGB to // multiply 10 bit yuv into high bits to allow any number of bits. -static int H010ToAR30Matrix(const uint16* src_y, +static int I010ToAR30Matrix(const uint16* src_y, int src_stride_y, const uint16* src_u, int src_stride_u, @@ -425,12 +425,10 @@ static int H010ToAR30Matrix(const uint16* src_y, int width, int height) { int y; - void (*I210ToARGBRow)(const uint16* y_buf, const uint16* u_buf, + void (*I210ToAR30Row)(const uint16* y_buf, const uint16* u_buf, const uint16* v_buf, uint8* rgb_buf, const struct YuvConstants* yuvconstants, int width) = - I210ToARGBRow_C; - void (*ARGBToAR30Row)(const uint8* src_argb, uint8* dst_rgb, int width) = - ARGBToAR30Row_C; + I210ToAR30Row_C; if (!src_y || !src_u || !src_v || !dst_ar30 || width <= 0 || height == 0) { return -1; } @@ -440,60 +438,51 @@ static int H010ToAR30Matrix(const uint16* src_y, dst_ar30 = dst_ar30 + (height - 1) * dst_stride_ar30; dst_stride_ar30 = -dst_stride_ar30; } -#if defined(HAS_I210TOARGBROW_SSSE3) +#if defined(HAS_I210TOAR30ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - I210ToARGBRow = I210ToARGBRow_Any_SSSE3; + I210ToAR30Row = I210ToAR30Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I210ToARGBRow = I210ToARGBRow_SSSE3; + I210ToAR30Row = I210ToAR30Row_SSSE3; } } #endif -#if defined(HAS_I210TOARGBROW_AVX2) +#if defined(HAS_I210TOAR30ROW_AVX2) if (TestCpuFlag(kCpuHasAVX2)) { - I210ToARGBRow = I210ToARGBRow_Any_AVX2; + I210ToAR30Row = I210ToAR30Row_Any_AVX2; if 
(IS_ALIGNED(width, 16)) { - I210ToARGBRow = I210ToARGBRow_AVX2; + I210ToAR30Row = I210ToAR30Row_AVX2; } } #endif -#if defined(HAS_ARGBTOAR30ROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToAR30Row = ARGBToAR30Row_Any_SSSE3; - if (IS_ALIGNED(width, 4)) { - ARGBToAR30Row = ARGBToAR30Row_SSSE3; + for (y = 0; y < height; ++y) { + I210ToAR30Row(src_y, src_u, src_v, dst_ar30, yuvconstants, width); + dst_ar30 += dst_stride_ar30; + src_y += src_stride_y; + if (y & 1) { + src_u += src_stride_u; + src_v += src_stride_v; } } -#endif -#if defined(HAS_ARGBTOAR30ROW_AVX2) - if (TestCpuFlag(kCpuHasAVX2)) { - ARGBToAR30Row = ARGBToAR30Row_Any_AVX2; - if (IS_ALIGNED(width, 8)) { - ARGBToAR30Row = ARGBToAR30Row_AVX2; - } - } -#endif - - { - // Row buffers for 8 bit YUV and RGB. - align_buffer_64(row_argb, width * 4); - - for (y = 0; y < height; ++y) { - I210ToARGBRow(src_y, src_u, src_v, row_argb, yuvconstants, width); - ARGBToAR30Row(row_argb, dst_ar30, width); - dst_ar30 += dst_stride_ar30; - src_y += src_stride_y; - if (y & 1) { - src_u += src_stride_u; - src_v += src_stride_v; - } - } - - free_aligned_buffer_64(row_argb); - } - return 0; } +// Convert I010 to AR30. +LIBYUV_API +int I010ToAR30(const uint16* src_y, + int src_stride_y, + const uint16* src_u, + int src_stride_u, + const uint16* src_v, + int src_stride_v, + uint8* dst_ar30, + int dst_stride_ar30, + int width, + int height) { + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + src_stride_v, dst_ar30, dst_stride_ar30, + &kYuvI601Constants, width, height); +} + // Convert H010 to AR30. 
LIBYUV_API int H010ToAR30(const uint16* src_y, @@ -506,7 +495,7 @@ int H010ToAR30(const uint16* src_y, int dst_stride_ar30, int width, int height) { - return H010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, + return I010ToAR30Matrix(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v, dst_ar30, dst_stride_ar30, &kYuvH709Constants, width, height); } diff --git a/source/row_any.cc b/source/row_any.cc index 9f4725bf5..d52a4a0ad 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -214,6 +214,9 @@ ANY31C(I422ToRGB565Row_Any_MSA, I422ToRGB565Row_MSA, 1, 0, 2, 7) memcpy(dst_ptr + (n >> DUVSHIFT) * BPP, out, SS(r, DUVSHIFT) * BPP); \ } +#ifdef HAS_I210TOAR30ROW_SSSE3 +ANY31CT(I210ToAR30Row_Any_SSSE3, I210ToAR30Row_SSSE3, 1, 0, uint16, 2, 4, 7) +#endif #ifdef HAS_I210TOARGBROW_SSSE3 ANY31CT(I210ToARGBRow_Any_SSSE3, I210ToARGBRow_SSSE3, 1, 0, uint16, 2, 4, 7) #endif diff --git a/source/row_common.cc b/source/row_common.cc index a0ca90b8a..395f45905 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -11,6 +11,7 @@ #include "libyuv/row.h" #include // For memcpy and memset. +#include #include "libyuv/basic_types.h" @@ -31,9 +32,8 @@ static __inline int32 clamp255(int32 v) { return (((255 - (v)) >> 31) | (v)) & 255; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32 clamp1023(int32 v) { + return (((1023 - (v)) >> 31) | (v)) & 1023; } static __inline uint32 Abs(int32 v) { @@ -49,15 +49,23 @@ static __inline int32 clamp255(int32 v) { return (v > 255) ? 255 : v; } -static __inline uint32 Clamp(int32 val) { - int v = clamp0(val); - return (uint32)(clamp255(v)); +static __inline int32 clamp1023(int32 v) { + return (v > 1023) ? 1023 : v; } static __inline uint32 Abs(int32 v) { return (v < 0) ? 
-v : v; } #endif // USE_BRANCHLESS +static __inline uint32 Clamp(int32 val) { + int v = clamp0(val); + return (uint32)(clamp255(v)); +} + +static __inline uint32 Clamp10(int32 val) { + int v = clamp0(val); + return (uint32)(clamp1023(v)); +} #ifdef LIBYUV_LITTLE_ENDIAN #define WRITEWORD(p, v) *(uint32*)(p) = v @@ -1340,6 +1348,52 @@ static __inline void YuvPixel10(uint16 y, *r = Clamp((int32)(-(v * vr) + y1 + br) >> 6); } +// C reference code that mimics the YUV 16 bit assembly. +// b, g and r are returned unshifted and unclamped; StoreAR30 shifts and clamps them. +static __inline void YuvPixel16(int16 y, + int16 u, + int16 v, + int* b, + int* g, + int* r, + const struct YuvConstants* yuvconstants) { +#if defined(__aarch64__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = -yuvconstants->kUVToRB[1]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#elif defined(__arm__) + int ub = -yuvconstants->kUVToRB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[4]; + int vr = -yuvconstants->kUVToRB[4]; + int bb = yuvconstants->kUVBiasBGR[0]; + int bg = yuvconstants->kUVBiasBGR[1]; + int br = yuvconstants->kUVBiasBGR[2]; + int yg = yuvconstants->kYToRgb[0] / 0x0101; +#else + int ub = yuvconstants->kUVToB[0]; + int ug = yuvconstants->kUVToG[0]; + int vg = yuvconstants->kUVToG[1]; + int vr = yuvconstants->kUVToR[1]; + int bb = yuvconstants->kUVBiasB[0]; + int bg = yuvconstants->kUVBiasG[0]; + int br = yuvconstants->kUVBiasR[0]; + int yg = yuvconstants->kYToRgb[0]; +#endif + + uint32 y1 = (uint32)((y << 6) * yg) >> 16; + u = clamp255(u >> 2); + v = clamp255(v >> 2); + *b = (int)(-(u * ub) + y1 + bb); + *g = (int)(-(u * ug + v * vg) + y1 + bg); + *r = (int)(-(v * vr) + y1 + br); +} + // Y contribution to R,G,B. Scale and bias. 
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ #define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */ @@ -1460,6 +1518,48 @@ void I210ToARGBRow_C(const uint16* src_y, } } +static void StoreAR30(uint8* rgb_buf, + int b, + int g, + int r) { + uint32 ar30; + b = b >> 4; // convert 10.6 to 10 bit. + g = g >> 4; + r = r >> 4; + b = Clamp10(b); + g = Clamp10(g); + r = Clamp10(r); + ar30 = b | ((uint32)g << 10) | ((uint32)r << 20) | 0xc0000000; + (*(uint32*)rgb_buf) = ar30; +} + +// 10 bit YUV to 10 bit AR30 +void I210ToAR30Row_C(const uint16* src_y, + const uint16* src_u, + const uint16* src_v, + uint8* rgb_buf, + const struct YuvConstants* yuvconstants, + int width) { + int x; + int b; + int g; + int r; + for (x = 0; x < width - 1; x += 2) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + YuvPixel16(src_y[1], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf + 4, b, g, r); + src_y += 2; + src_u += 1; + src_v += 1; + rgb_buf += 8; // Advance 2 pixels. 
+ } + if (width & 1) { + YuvPixel16(src_y[0], src_u[0], src_v[0], &b, &g, &r, yuvconstants); + StoreAR30(rgb_buf, b, g, r); + } +} + void I422AlphaToARGBRow_C(const uint8* src_y, const uint8* src_u, const uint8* src_v, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index df6a8c1e3..8ea735081 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1696,7 +1696,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, "movdqa 160(%[yuvconstants]),%%xmm13 \n" \ "movdqa 192(%[yuvconstants]),%%xmm14 \n" // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ +#define YUVTORGB16(yuvconstants) \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ "movdqa %%xmm0,%%xmm3 \n" \ @@ -1712,45 +1712,42 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, "pmulhuw %%xmm14,%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS \ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", #else #define YUVTORGB_SETUP(yuvconstants) // Convert 8 pixels: 8 UV and 8 Y -#define YUVTORGB(yuvconstants) \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "movdqa %%xmm0,%%xmm3 \n" \ - "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ - "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ - "psubw %%xmm1,%%xmm0 \n" \ - "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ - "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ - "psubw %%xmm2,%%xmm1 \n" \ - "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ - "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ - "psubw %%xmm3,%%xmm2 \n" \ - "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ - "paddsw %%xmm4,%%xmm0 \n" \ - "paddsw %%xmm4,%%xmm1 \n" \ - "paddsw %%xmm4,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 
\n" \ - "packuswb %%xmm2,%%xmm2 \n" +#define YUVTORGB16(yuvconstants) \ + "movdqa %%xmm0,%%xmm1 \n" \ + "movdqa %%xmm0,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "movdqa 96(%[yuvconstants]),%%xmm0 \n" \ + "pmaddubsw (%[yuvconstants]),%%xmm1 \n" \ + "psubw %%xmm1,%%xmm0 \n" \ + "movdqa 128(%[yuvconstants]),%%xmm1 \n" \ + "pmaddubsw 32(%[yuvconstants]),%%xmm2 \n" \ + "psubw %%xmm2,%%xmm1 \n" \ + "movdqa 160(%[yuvconstants]),%%xmm2 \n" \ + "pmaddubsw 64(%[yuvconstants]),%%xmm3 \n" \ + "psubw %%xmm3,%%xmm2 \n" \ + "pmulhuw 192(%[yuvconstants]),%%xmm4 \n" \ + "paddsw %%xmm4,%%xmm0 \n" \ + "paddsw %%xmm4,%%xmm1 \n" \ + "paddsw %%xmm4,%%xmm2 \n" #define YUVTORGB_REGS #endif +#define YUVTORGB(yuvconstants) \ + YUVTORGB16(yuvconstants) \ + "psraw $0x6,%%xmm0 \n" \ + "psraw $0x6,%%xmm1 \n" \ + "psraw $0x6,%%xmm2 \n" \ + "packuswb %%xmm0,%%xmm0 \n" \ + "packuswb %%xmm1,%%xmm1 \n" \ + "packuswb %%xmm2,%%xmm2 \n" + // Store 8 ARGB values. #define STOREARGB \ "punpcklbw %%xmm1,%%xmm0 \n" \ @@ -1774,6 +1771,32 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, "movdqu %%xmm0,0x10(%[dst_rgba]) \n" \ "lea 0x20(%[dst_rgba]),%[dst_rgba] \n" +// Store 8 AR30 values. 
+#define STOREAR30 \ + "psraw $0x4,%%xmm0 \n" \ + "psraw $0x4,%%xmm1 \n" \ + "psraw $0x4,%%xmm2 \n" \ + "pminsw %%xmm7,%%xmm0 \n" \ + "pminsw %%xmm7,%%xmm1 \n" \ + "pminsw %%xmm7,%%xmm2 \n" \ + "pmaxsw %%xmm6,%%xmm0 \n" \ + "pmaxsw %%xmm6,%%xmm1 \n" \ + "pmaxsw %%xmm6,%%xmm2 \n" \ + "psllw $0x4,%%xmm2 \n" \ + "movdqa %%xmm0,%%xmm3 \n" \ + "punpcklwd %%xmm2,%%xmm0 \n" \ + "punpckhwd %%xmm2,%%xmm3 \n" \ + "movdqa %%xmm1,%%xmm2 \n" \ + "punpcklwd %%xmm5,%%xmm1 \n" \ + "punpckhwd %%xmm5,%%xmm2 \n" \ + "pslld $0xa,%%xmm1 \n" \ + "pslld $0xa,%%xmm2 \n" \ + "por %%xmm1,%%xmm0 \n" \ + "por %%xmm2,%%xmm3 \n" \ + "movdqu %%xmm0,(%[dst_ar30]) \n" \ + "movdqu %%xmm3,0x10(%[dst_ar30]) \n" \ + "lea 0x20(%[dst_ar30]), %[dst_ar30] \n" + void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -1827,9 +1850,9 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, "pshufb %%xmm5,%%xmm0 \n" "pshufb %%xmm6,%%xmm1 \n" "palignr $0xc,%%xmm0,%%xmm1 \n" - "movq %%xmm0,(%[dst_rgb24]) \n" - "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" - "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" + "movq %%xmm0,(%[dst_rgb24]) \n" + "movdqu %%xmm1,0x8(%[dst_rgb24]) \n" + "lea 0x18(%[dst_rgb24]),%[dst_rgb24] \n" "subl $0x8,%[width] \n" "jg 1b \n" : [y_buf]"+r"(y_buf), // %[y_buf] @@ -1908,6 +1931,41 @@ void OMITFP I210ToARGBRow_SSSE3(const uint16* y_buf, ); } +// 10 bit YUV to AR30 +void OMITFP I210ToAR30Row_SSSE3(const uint16* y_buf, + const uint16* u_buf, + const uint16* v_buf, + uint8* dst_ar30, + const struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + YUVTORGB_SETUP(yuvconstants) + "sub %[u_buf],%[v_buf] \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $14,%%xmm5 \n" + "psllw $4,%%xmm5 \n" // 2 alpha bits + "pxor %%xmm6,%%xmm6 \n" // 0 for min + "pcmpeqb %%xmm7,%%xmm7 \n" + "psrlw $6,%%xmm7 \n" // 1023 for max + + LABELALIGN + "1: \n" + READYUV210 + YUVTORGB16(yuvconstants) + STOREAR30 + "sub $0x8,%[width] \n" + "jg 1b \n" + : [y_buf]"+r"(y_buf), // %[y_buf] + 
[u_buf]"+r"(u_buf), // %[u_buf] + [v_buf]"+r"(v_buf), // %[v_buf] + [dst_ar30]"+r"(dst_ar30), // %[dst_ar30] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] + : "memory", "cc", YUVTORGB_REGS + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7" + ); +} + #ifdef HAS_I422ALPHATOARGBROW_SSSE3 void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf,