diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 502184e38..f66705eaa 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -127,8 +127,6 @@ extern "C" { #define HAS_MIRRORUVROW_SSSE3 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV12TORGB565ROW_SSSE3 -#define HAS_NV21TOARGBROW_SSSE3 -#define HAS_NV21TORGB565ROW_SSSE3 #define HAS_RAWTOARGBROW_SSSE3 #define HAS_RAWTOYROW_SSSE3 #define HAS_RGB24TOARGBROW_SSSE3 @@ -209,8 +207,6 @@ extern "C" { #define HAS_J400TOARGBROW_AVX2 #define HAS_NV12TOARGBROW_AVX2 #define HAS_NV12TORGB565ROW_AVX2 -#define HAS_NV21TOARGBROW_AVX2 -#define HAS_NV21TORGB565ROW_AVX2 #define HAS_RGB565TOARGBROW_AVX2 #endif @@ -321,8 +317,6 @@ extern "C" { #define HAS_MIRRORUVROW_NEON #define HAS_NV12TOARGBROW_NEON #define HAS_NV12TORGB565ROW_NEON -#define HAS_NV21TOARGBROW_NEON -#define HAS_NV21TORGB565ROW_NEON #define HAS_RAWTOARGBROW_NEON #define HAS_RAWTOUVROW_NEON #define HAS_RAWTOYROW_NEON @@ -1068,11 +1062,6 @@ void NV12ToARGBRow_C(const uint8* src_y, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); -void NV21ToRGB565Row_C(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - struct YuvConstants* yuvconstants, - int width); void NV12ToRGB565Row_C(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, @@ -1433,21 +1422,11 @@ void NV12ToARGBRow_Any_SSSE3(const uint8* src_y, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_SSSE3(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - struct YuvConstants* yuvconstants, - int width); void NV12ToARGBRow_Any_AVX2(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, struct YuvConstants* yuvconstants, int width); -void NV21ToARGBRow_Any_AVX2(const uint8* src_y, - const uint8* src_vu, - uint8* dst_argb, - struct YuvConstants* yuvconstants, - int width); void NV12ToRGB565Row_Any_SSSE3(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, diff --git a/source/row_common.cc b/source/row_common.cc index 341380332..db79d6ba6 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2476,48 +2476,6 @@ void NV12ToRGB565Row_SSSE3(const uint8* src_y, } #endif -#if defined(HAS_YUY2TOARGBROW_SSSE3) -void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, - uint8* dst_argb, - struct YuvConstants* yuvconstants, - int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - YUY2ToUV422Row_SSE2(src_yuy2, row_u, row_v, twidth); - YUY2ToYRow_SSE2(src_yuy2, row_y, twidth); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, yuvconstants, twidth); - src_yuy2 += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif - -#if defined(HAS_UYVYTOARGBROW_SSSE3) -void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, - uint8* dst_argb, - struct YuvConstants* yuvconstants, - int width) { - // Row buffers for intermediate YUV pixels. - SIMD_ALIGNED(uint8 row_y[MAXTWIDTH]); - SIMD_ALIGNED(uint8 row_u[MAXTWIDTH / 2]); - SIMD_ALIGNED(uint8 row_v[MAXTWIDTH / 2]); - while (width > 0) { - int twidth = width > MAXTWIDTH ? MAXTWIDTH : width; - UYVYToUV422Row_SSE2(src_uyvy, row_u, row_v, twidth); - UYVYToYRow_SSE2(src_uyvy, row_y, twidth); - I422ToARGBRow_SSSE3(row_y, row_u, row_v, dst_argb, yuvconstants, twidth); - src_uyvy += twidth * 2; - dst_argb += twidth * 4; - width -= twidth; - } -} -#endif // !defined(LIBYUV_DISABLE_X86) - #if defined(HAS_I422TORGB565ROW_AVX2) void I422ToRGB565Row_AVX2(const uint8* src_y, const uint8* src_u, diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 68846dc8f..bc0cfe1f9 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -1326,6 +1326,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x8, [u_buf]) ",%[u_buf] \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ +<<<<<<< HEAD + "punpcklbw %%xmm4,%%xmm4 \n" \ +======= +>>>>>>> refs/remotes/origin/master "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from 422, upsample to 8 UV @@ -1336,6 +1340,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ +<<<<<<< HEAD + "punpcklbw %%xmm4,%%xmm4 \n" \ +======= +>>>>>>> refs/remotes/origin/master "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 2 UV from 411, upsample to 8 UV @@ -1347,6 +1355,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "punpcklwd %%xmm0,%%xmm0 \n" \ "punpckldq %%xmm0,%%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ +<<<<<<< HEAD + "punpcklbw %%xmm4,%%xmm4 \n" \ +======= +>>>>>>> refs/remotes/origin/master "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" // Read 4 UV from NV12, upsample to 8 UV @@ -1355,8 +1367,49 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "lea " MEMLEA(0x8, [uv_buf]) ",%[uv_buf] \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ "movq " MEMACCESS([y_buf]) ",%%xmm4 \n" \ +<<<<<<< HEAD + "punpcklbw %%xmm4,%%xmm4 \n" \ "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +// YUY2 shuf 8 Y to 16 Y. +static const vec8 kShuffleYUY2Y = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 +}; + +// YUY2 shuf 4 UV to 8 UV. +static const vec8 kShuffleYUY2UV = { + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 +}; + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 \ + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm4 \n" \ + "pshufb %[kShuffleYUY2Y], %%xmm4 \n" \ + "movdqu " MEMACCESS([yuy2_buf]) ",%%xmm0 \n" \ + "pshufb %[kShuffleYUY2UV], %%xmm0 \n" \ + "lea " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf] \n" + +// UYVY shuf 8 Y to 16 Y. +static const vec8 kShuffleUYVYY = { + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 +}; + +// UYVY shuf 4 UV to 8 UV. +static const vec8 kShuffleUYVYUV = { + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 +}; + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. +#define READUYVY \ + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm4 \n" \ + "pshufb %[kShuffleUYVYY], %%xmm4 \n" \ + "movdqu " MEMACCESS([uyvy_buf]) ",%%xmm0 \n" \ + "pshufb %[kShuffleUYVYUV], %%xmm0 \n" \ + "lea " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf] \n" +======= + "lea " MEMLEA(0x8, [y_buf]) ",%[y_buf] \n" +>>>>>>> refs/remotes/origin/master + // Convert 8 pixels: 8 UV and 8 Y #define YUVTORGB(yuvconstants) \ "movdqa %%xmm0,%%xmm1 \n" \ @@ -1371,7 +1424,10 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba, "movdqa " MEMACCESS2(160, [yuvconstants]) ",%%xmm2 \n" \ "pmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%xmm3 \n" \ "psubw %%xmm3,%%xmm2 \n" \ +<<<<<<< HEAD +======= "punpcklbw %%xmm4,%%xmm4 \n" \ +>>>>>>> refs/remotes/origin/master "pmulhuw " MEMACCESS2(192, [yuvconstants]) ",%%xmm4 \n" \ "paddsw %%xmm4,%%xmm0 \n" \ "paddsw %%xmm4,%%xmm1 \n" \ @@ -1452,7 +1508,7 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1479,7 +1535,7 @@ void OMITFP I444ToABGRRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1525,7 +1581,7 @@ void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0), [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24) : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -1570,7 +1626,7 @@ void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf, [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0), [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW) : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5", "xmm6" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6" ); } @@ -1597,7 +1653,7 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1624,7 +1680,7 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1648,7 +1704,55 @@ void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] // Does not use r14. - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READYUY2 + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [yuy2_buf]"+r"(yuy2_buf), // %[yuy2_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleYUY2Y]"m"(kShuffleYUY2Y), + [kShuffleYUY2UV]"m"(kShuffleYUY2UV) + // Does not use r14. + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" + ); +} + +void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + LABELALIGN + "1: \n" + READUYVY + YUVTORGB(yuvconstants) + STOREARGB + "sub $0x8,%[width] \n" + "jg 1b \n" + : [uyvy_buf]"+r"(uyvy_buf), // %[uyvy_buf] + [dst_argb]"+r"(dst_argb), // %[dst_argb] + [width]"+rm"(width) // %[width] + : [yuvconstants]"r"(yuvconstants), // %[yuvconstants] + [kShuffleUYVYY]"m"(kShuffleUYVYY), + [kShuffleUYVYUV]"m"(kShuffleUYVYUV) + // Does not use r14. + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1675,7 +1779,7 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1702,7 +1806,7 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1729,7 +1833,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } @@ -1808,7 +1912,7 @@ void OMITFP I422ToBGRARow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOBGRAROW_AVX2 @@ -1851,7 +1955,7 @@ void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOARGBROW_AVX2 @@ -1893,7 +1997,7 @@ void OMITFP I422ToABGRRow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TOABGRROW_AVX2 @@ -1935,7 +2039,7 @@ void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf, [width]"+rm"(width) // %[width] : [yuvconstants]"r"(yuvconstants) // %[yuvconstants] : "memory", "cc", NACL_R14 - "xmm0", "xmm1", "xmm2", "xmm3", "xmm5" + "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" ); } #endif // HAS_I422TORGBAROW_AVX2 diff --git a/source/row_win.cc b/source/row_win.cc index e8d3a9a19..182842cfc 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -36,6 +36,10 @@ extern "C" { xmm0 = _mm_unpacklo_epi16(xmm0, xmm0); \ u_buf += 4; \ xmm4 = _mm_loadl_epi64((__m128i*)y_buf); \ +<<<<<<< HEAD + xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ +======= +>>>>>>> refs/remotes/origin/master y_buf += 8; \ // Convert 8 pixels: 8 UV and 8 Y. @@ -48,7 +52,10 @@ extern "C" { xmm0 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasB, xmm0); \ xmm1 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasG, xmm1); \ xmm2 = _mm_sub_epi16(*(__m128i*)YuvConstants->kUVBiasR, xmm2); \ +<<<<<<< HEAD +======= xmm4 = _mm_unpacklo_epi8(xmm4, xmm4); \ +>>>>>>> refs/remotes/origin/master xmm4 = _mm_mulhi_epu16(xmm4, *(__m128i*)YuvConstants->kYToRgb); \ xmm0 = _mm_adds_epi16(xmm0, xmm4); \ xmm1 = _mm_adds_epi16(xmm1, xmm4); \ @@ -1853,6 +1860,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm1, ymm1, 0xd8 \ __asm vpunpcklbw ymm0, ymm0, ymm1 /* UV */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ +<<<<<<< HEAD + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 16] \ } @@ -1865,6 +1877,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ +<<<<<<< HEAD + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 16] \ } @@ -1878,6 +1895,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpckldq ymm0, ymm0, ymm0 /* UVUVUVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ +<<<<<<< HEAD + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 16] \ } @@ -1888,6 +1910,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vpermq ymm0, ymm0, 0xd8 \ __asm vpunpcklwd ymm0, ymm0, ymm0 /* UVUV (upsample) */ \ __asm vmovdqu xmm4, [eax] /* Y */ \ +<<<<<<< HEAD + __asm vpermq ymm4, ymm4, 0xd8 \ + __asm vpunpcklbw ymm4, ymm4, ymm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 16] \ } @@ -1903,8 +1930,11 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, __asm vmovdqu ymm3, ymmword ptr [YuvConstants + KUVBIASB] \ __asm vpsubw ymm0, ymm3, ymm0 \ /* Step 2: Find Y contribution to 16 R,G,B values */ \ +<<<<<<< HEAD +======= __asm vpermq ymm4, ymm4, 0xd8 \ __asm vpunpcklbw ymm4, ymm4, ymm4 \ +>>>>>>> refs/remotes/origin/master __asm vpmulhuw ymm4, ymm4, ymmword ptr [YuvConstants + KYTORGB] \ __asm vpaddsw ymm0, ymm0, ymm4 /* B += Y */ \ __asm vpaddsw ymm1, ymm1, ymm4 /* G += Y */ \ @@ -1987,7 +2017,7 @@ void I422ToARGBRow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2027,7 +2057,7 @@ void I444ToARGBRow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2066,7 +2096,7 @@ void I444ToABGRRow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2105,7 +2135,7 @@ void I411ToARGBRow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2142,7 +2172,7 @@ void NV12ToARGBRow_AVX2(const uint8* y_buf, mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb - mov ebp, [esp + 8 + 16] // YuvConstants + mov ebp, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2181,7 +2211,7 @@ void I422ToBGRARow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2221,7 +2251,7 @@ void I422ToRGBARow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2261,7 +2291,7 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi vpcmpeqb ymm5, ymm5, ymm5 // generate 0xffffffffffffffff for alpha @@ -2293,6 +2323,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea esi, [esi + 8] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm movq xmm4, qword ptr [eax] \ +<<<<<<< HEAD + __asm punpcklbw xmm4, xmm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 8] \ } @@ -2304,6 +2338,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ +<<<<<<< HEAD + __asm punpcklbw xmm4, xmm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 8] \ } @@ -2316,6 +2354,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm punpckldq xmm0, xmm0 /* UVUVUVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ +<<<<<<< HEAD + __asm punpcklbw xmm4, xmm4 \ +======= +>>>>>>> refs/remotes/origin/master __asm lea eax, [eax + 8] \ } @@ -2325,9 +2367,52 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm lea esi, [esi + 8] \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm movq xmm4, qword ptr [eax] \ +<<<<<<< HEAD + __asm punpcklbw xmm4, xmm4 \ __asm lea eax, [eax + 8] \ } +// YUY2 shuf 8 Y to 16 Y. +static const vec8 kShuffleYUY2Y = { + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14 +}; + +// YUY2 shuf 4 UV to 8 UV. +static const vec8 kShuffleYUY2UV = { + 1, 3, 1, 3, 5, 7, 5, 7, 9, 11, 9, 11, 13, 15, 13, 15 +}; + +// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV. +#define READYUY2 __asm { \ + __asm movdqu xmm4, [eax] /* YUY2 */ \ + __asm pshufb xmm4, xmmword ptr kShuffleYUY2Y \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleYUY2UV \ + __asm lea eax, [eax + 16] \ + } + +// UYVY shuf 8 Y to 16 Y. +static const vec8 kShuffleUYVYY = { + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15 +}; + +// UYVY shuf 4 UV to 8 UV. +static const vec8 kShuffleUYVYUV = { + 0, 2, 0, 2, 4, 6, 4, 6, 8, 10, 8, 10, 12, 14, 12, 14 +}; + +// Read 4 UYVY with 8 Y and update 4 UV to 8 UV. +#define READUYVY __asm { \ + __asm movdqu xmm4, [eax] /* UYVY */ \ + __asm pshufb xmm4, xmmword ptr kShuffleUYVYY \ + __asm movdqu xmm0, [eax] /* UV */ \ + __asm pshufb xmm0, xmmword ptr kShuffleUYVYUV \ + __asm lea eax, [eax + 16] \ +======= + __asm lea eax, [eax + 8] \ +>>>>>>> refs/remotes/origin/master + } + // Convert 8 pixels: 8 UV and 8 Y. #define YUVTORGB(YuvConstants) __asm { \ __asm movdqa xmm1, xmm0 \ @@ -2342,7 +2427,10 @@ void I422ToABGRRow_AVX2(const uint8* y_buf, __asm movdqa xmm2, xmmword ptr [YuvConstants + KUVBIASR] \ __asm pmaddubsw xmm3, xmmword ptr [YuvConstants + KUVTOR] \ __asm psubw xmm2, xmm3 \ +<<<<<<< HEAD +======= __asm punpcklbw xmm4, xmm4 \ +>>>>>>> refs/remotes/origin/master __asm pmulhuw xmm4, xmmword ptr [YuvConstants + KYTORGB] \ __asm paddsw xmm0, xmm4 /* B += Y */ \ __asm paddsw xmm1, xmm4 /* G += Y */ \ @@ -2492,7 +2580,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2529,7 +2617,7 @@ void I444ToABGRRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2566,7 +2654,7 @@ void I422ToRGB24Row_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi movdqa xmm5, xmmword ptr kShuffleMaskARGBToRGB24_0 @@ -2604,7 +2692,7 @@ void I422ToRAWRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi movdqa xmm5, xmmword ptr kShuffleMaskARGBToRAW_0 @@ -2642,7 +2730,7 @@ void I422ToRGB565Row_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate mask 0x0000001f @@ -2685,7 +2773,7 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2723,7 +2811,7 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // abgr - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2757,7 +2845,7 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, mov eax, [esp + 8 + 4] // Y mov esi, [esp + 8 + 8] // UV mov edx, [esp + 8 + 12] // argb - mov ebp, [esp + 8 + 16] // YuvConstants + mov ebp, [esp + 8 + 16] // yuvconstants mov ecx, [esp + 8 + 20] // width pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2775,6 +2863,62 @@ void NV12ToARGBRow_SSSE3(const uint8* y_buf, } } +// 8 pixels. +// 4 YUY2 values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) +void YUY2ToARGBRow_SSSE3(const uint8* src_yuy2, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebp + mov eax, [esp + 4 + 4] // yuy2 + mov edx, [esp + 4 + 8] // argb + mov ebp, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READYUY2 + YUVTORGB(ebp) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + ret + } +} + +// 8 pixels. +// 4 UYVY values with 8 Y and 4 UV producing 8 ARGB (32 bytes). +__declspec(naked) +void UYVYToARGBRow_SSSE3(const uint8* src_uyvy, + uint8* dst_argb, + struct YuvConstants* yuvconstants, + int width) { + __asm { + push ebp + mov eax, [esp + 4 + 4] // uyvy + mov edx, [esp + 4 + 8] // argb + mov ebp, [esp + 4 + 12] // yuvconstants + mov ecx, [esp + 4 + 16] // width + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + + convertloop: + READUYVY + YUVTORGB(ebp) + STOREARGB + + sub ecx, 8 + jg convertloop + + pop ebp + ret + } +} + __declspec(naked) void I422ToBGRARow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -2790,7 +2934,7 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi @@ -2824,7 +2968,7 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -2859,7 +3003,7 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, mov esi, [esp + 12 + 8] // U mov edi, [esp + 12 + 12] // V mov edx, [esp + 12 + 16] // argb - mov ebp, [esp + 12 + 20] // YuvConstants + mov ebp, [esp + 12 + 20] // yuvconstants mov ecx, [esp + 12 + 24] // width sub edi, esi @@ -3524,8 +3668,7 @@ void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int count) { #ifdef HAS_YUY2TOYROW_AVX2 __declspec(naked) -void YUY2ToYRow_AVX2(const uint8* src_yuy2, - uint8* dst_y, int pix) { +void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int pix) { __asm { mov eax, [esp + 4] // src_yuy2 mov edx, [esp + 8] // dst_y