diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 319652a37..206e270ee 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -220,6 +220,11 @@ extern "C" {
 #define HAS_ARGBUNATTENUATEROW_AVX2
 #endif
 
+// The following are available and require VS2012:
+#if !defined(LIBYUV_DISABLE_X86) && defined(VISUALC_HAS_AVX2)
+#define HAS_YTOARGBROW_AVX2
+#endif
+
 // The following are Yasm x86 only:
 // TODO(fbarchard): Port AVX2 to inline.
 #if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
@@ -980,9 +985,6 @@ void I422ToRGB565Row_C(const uint8* src_y,
                        const uint8* src_v,
                        uint8* dst_rgb565,
                        int width);
-void YToARGBRow_C(const uint8* src_y,
-                  uint8* dst_argb,
-                  int width);
 void I422ToARGBRow_AVX2(const uint8* src_y,
                         const uint8* src_u,
                         const uint8* src_v,
@@ -1182,15 +1184,25 @@ void I422ToRAWRow_Any_SSSE3(const uint8* src_y,
                             const uint8* src_v,
                             uint8* dst_argb,
                             int width);
+
+void YToARGBRow_C(const uint8* src_y,
+                  uint8* dst_argb,
+                  int width);
 void YToARGBRow_SSE2(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
+void YToARGBRow_AVX2(const uint8* src_y,
+                     uint8* dst_argb,
+                     int width);
 void YToARGBRow_NEON(const uint8* src_y,
                      uint8* dst_argb,
                      int width);
 void YToARGBRow_Any_SSE2(const uint8* src_y,
                          uint8* dst_argb,
                          int width);
+void YToARGBRow_Any_AVX2(const uint8* src_y,
+                         uint8* dst_argb,
+                         int width);
 void YToARGBRow_Any_NEON(const uint8* src_y,
                          uint8* dst_argb,
                          int width);
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 7efda5c07..6a890e91f 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -276,6 +276,14 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_YTOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    YToARGBRow = YToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      YToARGBRow = YToARGBRow_AVX2;
+    }
+  }
+#endif
 #if defined(HAS_YTOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
     YToARGBRow = YToARGBRow_Any_NEON;
diff --git a/source/row_any.cc b/source/row_any.cc
index 36aa70d35..3a8fa0011 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -169,6 +169,10 @@ RGBANY(I400ToARGBRow_Any_SSE2, I400ToARGBRow_SSE2, I400ToARGBRow_C,
 RGBANY(YToARGBRow_Any_SSE2, YToARGBRow_SSE2, YToARGBRow_C,
        7, 1, 4)
 #endif
+#if defined(HAS_YTOARGBROW_AVX2)
+RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C,
+       15, 1, 4)
+#endif
 #if defined(HAS_YUY2TOARGBROW_SSSE3)
 RGBANY(YUY2ToARGBRow_Any_SSSE3, YUY2ToARGBRow_SSSE3, YUY2ToARGBRow_C,
        15, 2, 4)
diff --git a/source/row_win.cc b/source/row_win.cc
index c7abc001f..a906968f9 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2299,8 +2299,8 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf,
 #endif  // HAS_I422TOARGBROW_SSSE3
 
-// TODO(fbarchard): Remove shift by 6.
 #ifdef HAS_YTOARGBROW_SSE2
+// 8 pixels of Y converted to 8 pixels of ARGB (32 bytes).
 __declspec(naked) __declspec(align(16))
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
@@ -2341,12 +2341,62 @@ void YToARGBRow_SSE2(const uint8* y_buf,
     lea        edx, [edx + 32]
     sub        ecx, 8
     jg         convertloop
-
     ret
   }
 }
 
 #endif  // HAS_YTOARGBROW_SSE2
 
+#ifdef HAS_YTOARGBROW_AVX2
+// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void YToARGBRow_AVX2(const uint8* y_buf,
+                     uint8* rgb_buf,
+                     int width) {
+  __asm {
+    vpcmpeqb   ymm4, ymm4, ymm4           // generate mask 0xff000000
+    vpslld     ymm4, ymm4, 24
+    mov        eax, 0x04ad04ad            // 04ad = 1197 = round(1.164 * 64 * 16)
+    vmovd      xmm3, eax
+    vbroadcastss ymm3, xmm3
+    mov        eax, 0x4a7f4a7f            // 4a7f = 19071 = round(1.164 * 64 * 256)
+    vmovd      xmm2, eax
+    vbroadcastss ymm2, xmm2
+
+    mov        eax, [esp + 4]             // Y
+    mov        edx, [esp + 8]             // rgb
+    mov        ecx, [esp + 12]            // width
+
+ convertloop:
+    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
+    vmovdqu    xmm0, [eax]
+    lea        eax, [eax + 16]
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklbw ymm0, ymm0, ymm0           // Y.Y
+    vpmulhuw   ymm0, ymm0, ymm2
+    vpsubusw   ymm0, ymm0, ymm3
+    vpsrlw     ymm0, ymm0, 6
+    vpackuswb  ymm0, ymm0, ymm0           // G.  still mutated: 3120
+
+    // TODO(fbarchard): Weave alpha with unpack.
+    // Step 2: Weave into ARGB
+    vpunpcklbw ymm1, ymm0, ymm0           // GG - mutates
+    vpermq     ymm1, ymm1, 0xd8
+    vpunpcklwd ymm0, ymm1, ymm1           // GGGG first 8 pixels
+    vpunpckhwd ymm1, ymm1, ymm1           // GGGG next 8 pixels
+    vpor       ymm0, ymm0, ymm4
+    vpor       ymm1, ymm1, ymm4
+    vmovdqu    [edx], ymm0
+    vmovdqu    [edx + 32], ymm1
+    lea        edx, [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+    ret
+  }
+}
+#endif  // HAS_YTOARGBROW_AVX2
+
 #ifdef HAS_MIRRORROW_SSSE3
 // Shuffle table for reversing the bytes.
 static const uvec8 kShuffleMirror = {
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 5ac2a2ecd..37eb97dbc 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -1255,11 +1255,13 @@ TEST_F(libyuvTest, TestYToARGB) {
   YToARGB(y, 0, argb, 0, 32, 1);
 
   for (int i = 0; i < 32; ++i) {
-    printf("%d: %d <-> %d,%d,%d,%d\n", y[i], expectedg[i],
+    printf("%2d %d: %d <-> %d,%d,%d,%d\n", i, y[i], expectedg[i],
            argb[i * 4 + 0], argb[i * 4 + 1], argb[i * 4 + 2], argb[i * 4 + 3]);
+  }
+  for (int i = 0; i < 32; ++i) {
     EXPECT_NEAR(expectedg[i], argb[i * 4 + 0], 1);
   }
 }
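A note on the fixed-point math in YToARGBRow_SSE2/AVX2: each Y byte is widened to 16 bits by unpacking it with itself (y * 0x0101), multiplied by 0x4a7f keeping the high 16 bits of the product (vpmulhuw), debiased by 0x04ad with unsigned saturation (vpsubusw), shifted right by 6 and saturated back to 8 bits. A scalar sketch of that computation, assuming only the constants shown in the asm, is below; the helper name and the <stdint.h> types are illustrative and not part of libyuv, which uses its own uint8/uint32 typedefs:

    #include <stdint.h>

    // Scalar model of the SIMD loop: approximates clamp((y - 16) * 1.164),
    // the BT.601 luma range expansion, entirely in integer math.
    static uint8_t YToGray(uint8_t y) {
      uint32_t y16 = y * 0x0101u;             // duplicate the byte into a 16-bit lane
      uint32_t g = (y16 * 19071u) >> 16;      // vpmulhuw: high half of the product
      g = g > 1197u ? g - 1197u : 0u;         // vpsubusw: subtract the bias for y = 16
      g >>= 6;                                // vpsrlw: drop the 6 fractional bits
      return (uint8_t)(g > 255u ? 255u : g);  // vpackuswb: saturate to 8 bits
    }

Spot checks: y = 16 gives 0, y = 128 gives 130, y = 235 gives 255, and TestYToARGB above allows a tolerance of 1 when comparing the G channel against expectedg[]. Each gray value is then replicated into B, G and R, and the 0xff000000 mask in ymm4/xmm4 forces alpha to opaque.

The RGBANY(YToARGBRow_Any_AVX2, YToARGBRow_AVX2, YToARGBRow_C, 15, 1, 4) line in row_any.cc is what lets the dispatch added to convert_argb.cc handle widths that are not a multiple of 16. Roughly, and with the caveat that the exact macro body in row_any.cc may differ, it expands to something like the sketch below, which assumes the declarations added to row.h above:

    // Rough expansion of RGBANY(..., 15, 1, 4):
    // MASK = 15, SBPP = 1 (Y bytes per pixel), BPP = 4 (ARGB bytes per pixel).
    void YToARGBRow_Any_AVX2(const uint8* src_y, uint8* dst_argb, int width) {
      int n = width & ~15;                    // largest multiple of 16 pixels
      if (n > 0) {
        YToARGBRow_AVX2(src_y, dst_argb, n);  // do-while style SIMD loop, so skip n == 0
      }
      YToARGBRow_C(src_y + n * 1,             // remaining 0..15 pixels in plain C
                   dst_argb + n * 4,
                   width & 15);
    }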