From 8fc02134c838294ca1fe1ee60e24111ad311cfb1 Mon Sep 17 00:00:00 2001
From: Frank Barchard
Date: Tue, 20 Sep 2022 13:40:07 -0700
Subject: [PATCH] 10/12 bit YUV replicate upper bits to low bits before
 converting to RGB

- shift high bits of 10 and 12 bit into lower bits

Bug: libyuv:941, libyuv:942
Change-Id: I14381dbf226ef27dcce06893ea88860835639baa
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3906085
Reviewed-by: Mirko Bonadei
Commit-Queue: Frank Barchard
Reviewed-by: Wan-Teh Chang
---
 source/row_common.cc | 51 ++++++++++++++++++++++++++++++++++++++++++--
 source/row_gcc.cc    | 39 +++++++++++++++++++++++++++------
 2 files changed, 81 insertions(+), 9 deletions(-)

diff --git a/source/row_common.cc b/source/row_common.cc
index 2531c85b8..20eb48ec0 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -1723,7 +1723,7 @@ static __inline void YuvPixel10_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y32 = y << 6;
+  uint32_t y32 = (y << 6) | (y >> 4);
   u = clamp255(u >> 2);
   v = clamp255(v >> 2);
   CALC_RGB16;
@@ -1742,7 +1742,7 @@ static __inline void YuvPixel12_16(int16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y32 = y << 4;
+  uint32_t y32 = (y << 4) | (y >> 8);
   u = clamp255(u >> 4);
   v = clamp255(v >> 4);
   CALC_RGB16;
@@ -4052,6 +4052,30 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
+// SSSE3 implemented in row_gcc.cc row_win.cc for 32 bit
+// For row_win Visual C (not clangcl)
+#if defined(HAS_I422TORGB24ROW_SSSE3) && defined(_M_X64) && !defined(__clang__)
+void I422ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
 #if defined(HAS_I422TORGB24ROW_AVX2)
 void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
@@ -4078,6 +4102,29 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
+// For row_win Visual C (not clangcl)
+#if defined(HAS_I444TORGB24ROW_SSSE3) && defined(_M_X64) && !defined(__clang__)
+void I444ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I444ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth;
+    src_v += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
 #if defined(HAS_I444TORGB24ROW_AVX2)
 void I444ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index af92e3f02..fa1b54793 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -2348,7 +2348,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 #define READYUVA210 \
@@ -2360,7 +2363,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n" \
   "movdqu (%[a_buf]),%%xmm5 \n" \
   "psraw $2,%%xmm5 \n" \
@@ -2379,7 +2385,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpckhwd %%xmm2,%%xmm1 \n" \
   "packuswb %%xmm1,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 444 10 bit. With 8 Alpha.
@@ -2394,7 +2403,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpckhwd %%xmm2,%%xmm1 \n" \
   "packuswb %%xmm1,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
-  "psllw $0x6,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
+  "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n" \
   "movdqu (%[a_buf]),%%xmm5 \n" \
   "psraw $2,%%xmm5 \n" \
@@ -2411,7 +2423,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
-  "psllw $0x4,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
+  "psllw $4,%%xmm4 \n" \
+  "psraw $8,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
@@ -3432,7 +3447,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
@@ -3447,7 +3464,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n" \
   "vmovdqu (%[a_buf]),%%ymm5 \n" \
   "vpsraw $2,%%ymm5,%%ymm5 \n" \
@@ -3465,7 +3484,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
   "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 212 12 bit, upsample to 16 UV
@@ -3480,7 +3501,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
+  "vpsllw $4,%%ymm4,%%ymm2 \n" \
+  "vpsraw $8,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 16 UV from 410. With 16 Alpha.
@@ -3494,7 +3517,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
   "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n" \
   "vmovdqu (%[a_buf]),%%ymm5 \n" \
   "vpsraw $2,%%ymm5,%%ymm5 \n" \
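
For reference, the scaling change above can be checked with a small standalone C program. This is not part of the patch, and the helper names Scale10To16/Scale12To16 are illustrative only. A plain left shift leaves the vacated low bits zero, so 10-bit white (0x3FF) maps to 0xFFC0 rather than 0xFFFF; replicating the top bits into the low bits restores the full 16-bit range, which is what the new scalar expressions and the psllw/psraw/paddw (vpsllw/vpsraw/vpaddw) sequences compute.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Expand a 10-bit luma sample to 16 bits by replicating the 6 high bits
// into the vacated low bits, mirroring the new YuvPixel10_16 expression.
// (Illustrative helper, not a libyuv function.)
static uint16_t Scale10To16(uint16_t y10) {
  return (uint16_t)((y10 << 6) | (y10 >> 4));
}

// Expand a 12-bit luma sample to 16 bits by replicating the 4 high bits
// into the vacated low bits, mirroring the new YuvPixel12_16 expression.
// (Illustrative helper, not a libyuv function.)
static uint16_t Scale12To16(uint16_t y12) {
  return (uint16_t)((y12 << 4) | (y12 >> 8));
}

int main(void) {
  // Full-scale input now maps to full-scale output.
  assert(Scale10To16(0x3FF) == 0xFFFF);  // plain (0x3FF << 6) is only 0xFFC0
  assert(Scale12To16(0xFFF) == 0xFFFF);  // plain (0xFFF << 4) is only 0xFFF0
  assert(Scale10To16(0) == 0 && Scale12To16(0) == 0);
  printf("10-bit 0x3FF -> 0x%04X, 12-bit 0xFFF -> 0x%04X\n",
         Scale10To16(0x3FF), Scale12To16(0xFFF));
  return 0;
}

The SIMD hunks add the two halves instead of or-ing them; for in-range 10/12-bit samples the shifted-left high part and the shifted-right low part occupy disjoint bits, so the add cannot carry and gives the same result.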