10/12 bit YUV replicate upper bits to low bits before converting to RGB

- shift high bits of 10 and 12 bit into lower bits

Bug: libyuv:941, libyuv:942
Change-Id: I14381dbf226ef27dcce06893ea88860835639baa
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3906085
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Author: Frank Barchard, 2022-09-20 13:40:07 -07:00 (committed by libyuv LUCI CQ)
Parent: e4b1ddd8fe
Commit: 8fc02134c8
2 changed files with 81 additions and 9 deletions
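The scalar form of the change states the technique most clearly: to widen a 10- or 12-bit sample to 16 bits without losing full scale, shift it to the top of the word and replicate its high bits into the vacated low bits. A minimal sketch using the exact expressions this commit puts into YuvPixel10_16 and YuvPixel12_16 (the helper names are illustrative, not libyuv API):

#include <stdint.h>

// Widen a sample to 16 bits: shift to the top, then replicate the
// high bits into the vacated low bits so full scale maps to 0xFFFF.
static inline uint16_t Expand10To16(uint16_t y) {
  return (uint16_t)((y << 6) | (y >> 4));  // 0x3FF -> 0xFFC0 | 0x3F = 0xFFFF
}
static inline uint16_t Expand12To16(uint16_t y) {
  return (uint16_t)((y << 4) | (y >> 8));  // 0xFFF -> 0xFFF0 | 0x0F = 0xFFFF
}

A plain y << 6 maps 10-bit white 0x3FF to only 0xFFC0, slightly below 16-bit white; the replication closes that gap.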

source/row_common.cc

@@ -1723,7 +1723,7 @@ static __inline void YuvPixel10_16(uint16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y32 = y << 6;
+  uint32_t y32 = (y << 6) | (y >> 4);
   u = clamp255(u >> 2);
   v = clamp255(v >> 2);
   CALC_RGB16;
@@ -1742,7 +1742,7 @@ static __inline void YuvPixel12_16(int16_t y,
                                    int* r,
                                    const struct YuvConstants* yuvconstants) {
   LOAD_YUV_CONSTANTS;
-  uint32_t y32 = y << 4;
+  uint32_t y32 = (y << 4) | (y >> 8);
   u = clamp255(u >> 4);
   v = clamp255(v >> 4);
   CALC_RGB16;
@@ -4052,6 +4052,30 @@ void I422ToARGB4444Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
+// SSSE3 implemented in row_gcc.cc row_win.cc for 32 bit
+// For row_win Visual C (not clangcl)
+#if defined(HAS_I422TORGB24ROW_SSSE3) && defined(_M_X64) && !defined(__clang__)
+void I422ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I422ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth / 2;
+    src_v += twidth / 2;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
 #if defined(HAS_I422TORGB24ROW_AVX2)
 void I422ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,
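This wrapper strip-mines the row: it converts at most MAXTWIDTH pixels per pass into a fixed stack buffer of ARGB, then repacks that strip to RGB24. Since the input is 4:2:2, one U,V pair covers two pixels and the chroma pointers advance at half the luma rate (twidth / 2); the I444 variant added below advances them at the full rate. A hedged sketch of how such a row function is typically driven over a frame (the driver function, stride names, and plane-layout assumptions are illustrative, not part of this commit):

#include "libyuv/row.h"  // declares the row functions and YuvConstants

// Hypothetical per-frame driver for an I422 source; libyuv's planar
// convert functions do this internally. I422 chroma planes are half
// width but full height, so every row gets its own U/V row.
void ConvertI422ToRGB24(const uint8_t* y, int y_stride,
                        const uint8_t* u, int u_stride,
                        const uint8_t* v, int v_stride,
                        uint8_t* rgb, int rgb_stride,
                        const struct YuvConstants* c,
                        int width, int height) {
  for (int row = 0; row < height; ++row) {
    I422ToRGB24Row_SSSE3(y, u, v, rgb, c, width);
    y += y_stride;
    u += u_stride;
    v += v_stride;
    rgb += rgb_stride;
  }
}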
@@ -4078,6 +4102,29 @@ void I422ToRGB24Row_AVX2(const uint8_t* src_y,
 }
 #endif
 
+// For row_win Visual C (not clangcl)
+#if defined(HAS_I444TORGB24ROW_SSSE3) && defined(_M_X64) && !defined(__clang__)
+void I444ToRGB24Row_SSSE3(const uint8_t* src_y,
+                          const uint8_t* src_u,
+                          const uint8_t* src_v,
+                          uint8_t* dst_rgb24,
+                          const struct YuvConstants* yuvconstants,
+                          int width) {
+  // Row buffer for intermediate ARGB pixels.
+  SIMD_ALIGNED(uint8_t row[MAXTWIDTH * 4]);
+  while (width > 0) {
+    int twidth = width > MAXTWIDTH ? MAXTWIDTH : width;
+    I444ToARGBRow_SSSE3(src_y, src_u, src_v, row, yuvconstants, twidth);
+    ARGBToRGB24Row_SSSE3(row, dst_rgb24, twidth);
+    src_y += twidth;
+    src_u += twidth;
+    src_v += twidth;
+    dst_rgb24 += twidth * 3;
+    width -= twidth;
+  }
+}
+#endif
+
 #if defined(HAS_I444TORGB24ROW_AVX2)
 void I444ToRGB24Row_AVX2(const uint8_t* src_y,
                          const uint8_t* src_u,

source/row_gcc.cc

@@ -2348,7 +2348,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 #define READYUVA210 \
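The asm mirrors the scalar expression: movdqa copies the eight 10-bit Y lanes into a scratch register, psllw $6 shifts the original up into the top of each 16-bit lane, psraw $4 shifts the copy down so the sample's top six bits land in the low bits, and paddw combines the two. Two details make this safe: the low six bits of y << 6 are zero, so the add is equivalent to an or, and 10-bit data leaves each lane's sign bit clear, so the arithmetic shift behaves like a logical one. The same sequence with SSE2 intrinsics, as a sketch (the function name is illustrative):

#include <emmintrin.h>  // SSE2

// Sketch of the movdqa/psllw/psraw/paddw sequence above: widen eight
// 10-bit Y samples (one per 16-bit lane) to full-range 16 bit.
static inline __m128i Widen10To16(__m128i y) {
  __m128i hi = _mm_slli_epi16(y, 6);  // y << 6
  __m128i lo = _mm_srai_epi16(y, 4);  // y >> 4: top 6 bits of the sample
  return _mm_add_epi16(hi, lo);       // bit ranges are disjoint: add == or
}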
@@ -2360,7 +2363,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n" \
   "movdqu (%[a_buf]),%%xmm5 \n" \
   "psraw $2,%%xmm5 \n" \
@@ -2379,7 +2385,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpckhwd %%xmm2,%%xmm1 \n" \
   "packuswb %%xmm1,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
   "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 444 10 bit. With 8 Alpha.
@@ -2394,7 +2403,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "punpckhwd %%xmm2,%%xmm1 \n" \
   "packuswb %%xmm1,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
-  "psllw $0x6,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
+  "psllw $6,%%xmm4 \n" \
+  "psraw $4,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n" \
   "movdqu (%[a_buf]),%%xmm5 \n" \
   "psraw $2,%%xmm5 \n" \
@@ -2411,7 +2423,10 @@ void RGBAToUVRow_SSSE3(const uint8_t* src_rgba,
   "packuswb %%xmm3,%%xmm3 \n" \
   "punpcklwd %%xmm3,%%xmm3 \n" \
   "movdqu (%[y_buf]),%%xmm4 \n" \
-  "psllw $0x4,%%xmm4 \n" \
+  "movdqa %%xmm4,%%xmm2 \n" \
+  "psllw $4,%%xmm4 \n" \
+  "psraw $8,%%xmm2 \n" \
+  "paddw %%xmm2,%%xmm4 \n" \
   "lea 0x10(%[y_buf]),%[y_buf] \n"
 
 // Read 4 UV from 422, upsample to 8 UV. With 8 Alpha.
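For the 12-bit path the replication needs shift counts 4 and 8, matching the scalar (y << 4) | (y >> 8) above: 0xFFF << 4 is 0xFFF0 and 0xFFF >> 8 is 0x0F, which combine to 0xFFFF. The 12-bit AVX2 hunk below uses exactly those counts (vpsllw $4, vpsraw $8).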
@@ -3432,7 +3447,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 210, upsample to 16 UV. With 16 Alpha.
@@ -3447,7 +3464,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n" \
   "vmovdqu (%[a_buf]),%%ymm5 \n" \
   "vpsraw $2,%%ymm5,%%ymm5 \n" \
@@ -3465,7 +3484,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
   "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 8 UV from 212 12 bit, upsample to 16 UV
@@ -3480,7 +3501,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpackuswb %%ymm3,%%ymm3,%%ymm3 \n" \
   "vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $0x4,%%ymm4,%%ymm4 \n" \
+  "vpsllw $4,%%ymm4,%%ymm2 \n" \
+  "vpsraw $8,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n"
 
 // Read 16 UV from 410. With 16 Alpha.
@@ -3494,7 +3517,9 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8_t* y_buf,
   "vpunpcklwd %%ymm2,%%ymm3,%%ymm3 \n" \
   "vpackuswb %%ymm1,%%ymm3,%%ymm3 \n" \
   "vmovdqu (%[y_buf]),%%ymm4 \n" \
-  "vpsllw $6,%%ymm4,%%ymm4 \n" \
+  "vpsllw $6,%%ymm4,%%ymm2 \n" \
+  "vpsraw $4,%%ymm4,%%ymm4 \n" \
+  "vpaddw %%ymm2,%%ymm4,%%ymm4 \n" \
   "lea 0x20(%[y_buf]),%[y_buf] \n" \
   "vmovdqu (%[a_buf]),%%ymm5 \n" \
   "vpsraw $2,%%ymm5,%%ymm5 \n" \