diff --git a/README.chromium b/README.chromium
index 2cf1c3b75..5cd5f95ea 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 659
+Version: 660
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 62a9f5d2a..63f5f4e16 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -133,12 +133,12 @@ extern "C" {
 #define HAS_ARGBCOLORTABLEROW_X86
 // Visual C 2012 required for AVX2.
 #if _MSC_VER >= 1700
-// TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
 #define HAS_ARGBSHUFFLEROW_AVX2
 #define HAS_ARGBTOUVROW_AVX2
-#define HAS_ARGBTOYROW_AVX2
 #define HAS_ARGBTOYJROW_AVX2
+#define HAS_ARGBTOYROW_AVX2
 #define HAS_HALFROW_AVX2
+#define HAS_I422TOARGBROW_AVX2
 #define HAS_MERGEUVROW_AVX2
 #define HAS_MIRRORROW_AVX2
 #define HAS_SPLITUVROW_AVX2
@@ -826,7 +826,11 @@ void I422ToRGB565Row_C(const uint8* src_y,
 void YToARGBRow_C(const uint8* src_y,
                   uint8* dst_argb,
                   int width);
-
+void I422ToARGBRow_AVX2(const uint8* src_y,
+                        const uint8* src_u,
+                        const uint8* src_v,
+                        uint8* dst_argb,
+                        int width);
 void I444ToARGBRow_SSSE3(const uint8* src_y,
                          const uint8* src_u,
                          const uint8* src_v,
@@ -950,6 +954,11 @@ void I422ToRGBARow_Unaligned_SSSE3(const uint8* src_y,
                                    const uint8* src_v,
                                    uint8* dst_rgba,
                                    int width);
+void I422ToARGBRow_Any_AVX2(const uint8* src_y,
+                            const uint8* src_u,
+                            const uint8* src_v,
+                            uint8* dst_argb,
+                            int width);
 void I444ToARGBRow_Any_SSSE3(const uint8* src_y,
                              const uint8* src_u,
                              const uint8* src_v,
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 0564f777e..9670bea3e 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 659
+#define LIBYUV_VERSION 660
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index 8cb62fcc3..55d4d6904 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -152,14 +152,24 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
       }
     }
   }
-#elif defined(HAS_I422TOARGBROW_NEON)
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
     }
   }
-#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
diff --git a/source/convert_from.cc b/source/convert_from.cc
index bb7c35b6f..870a81c8d 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -557,14 +557,24 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
       }
     }
   }
-#elif defined(HAS_I422TOARGBROW_NEON)
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
     }
   }
-#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
diff --git a/source/format_conversion.cc b/source/format_conversion.cc
index d2e773f3d..b17619646 100644
--- a/source/format_conversion.cc
+++ b/source/format_conversion.cc
@@ -410,14 +410,24 @@ int I420ToBayer(const uint8* src_y, int src_stride_y,
       I422ToARGBRow = I422ToARGBRow_SSSE3;
     }
   }
-#elif defined(HAS_I422TOARGBROW_NEON)
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+    I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+    if (IS_ALIGNED(width, 16)) {
+      I422ToARGBRow = I422ToARGBRow_AVX2;
+    }
+  }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     I422ToARGBRow = I422ToARGBRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
       I422ToARGBRow = I422ToARGBRow_NEON;
     }
   }
-#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
   if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
       IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
       IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
diff --git a/source/row_any.cc b/source/row_any.cc
index 7e042d603..20cd7c044 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -60,6 +60,9 @@ YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
 YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
 YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
 #endif  // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I422TOARGBROW_AVX2
+YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
+#endif  // HAS_I422TOARGBROW_AVX2
 #ifdef HAS_I422TOARGBROW_NEON
 YANY(I444ToARGBRow_Any_NEON, I444ToARGBRow_NEON, I444ToARGBRow_C, 0, 4, 7)
 YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1, 4, 7)
diff --git a/source/row_win.cc b/source/row_win.cc
index 5a1ec83e7..826f9dca9 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -2068,8 +2068,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 }
 #endif  // HAS_ARGBTOYROW_SSSE3
 
-#ifdef HAS_I422TOARGBROW_SSSE3
-
 #define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
 
 #define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
@@ -2085,6 +2083,104 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #define BG UG * 128 + VG * 128
 #define BR UR * 128 + VR * 128
 
+#ifdef HAS_I422TOARGBROW_AVX2
+
+static const lvec8 kUVToB_AVX = {
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB,
+  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const lvec8 kUVToR_AVX = {
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR,
+  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const lvec8 kUVToG_AVX = {
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG,
+  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const lvec16 kYToRgb_AVX = { YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG };
+static const lvec16 kYSub16_AVX = { 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16 };
+static const lvec16 kUVBiasB_AVX = { BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB};
+static const lvec16 kUVBiasG_AVX = { BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG};
+static const lvec16 kUVBiasR_AVX = { BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR};
+
+// 16 pixels
+// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
+__declspec(naked) __declspec(align(16))
+void I422ToARGBRow_AVX2(const uint8* y_buf,
+                        const uint8* u_buf,
+                        const uint8* v_buf,
+                        uint8* dst_argb,
+                        int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // Y
+    mov        esi, [esp + 8 + 8]   // U
+    mov        edi, [esp + 8 + 12]  // V
+    mov        edx, [esp + 8 + 16]  // argb
+    mov        ecx, [esp + 8 + 20]  // width
+    sub        edi, esi
+    vpcmpeqb   ymm5, ymm5, ymm5     // generate 0xffffffffffffffff for alpha
+    vpxor      ymm4, ymm4, ymm4
+
+    align      16
+ convertloop:
+    vmovq      xmm0, qword ptr [esi]        //  U
+    vmovq      xmm1, qword ptr [esi + edi]  //  V
+    lea        esi,  [esi + 8]
+    vpunpcklbw ymm0, ymm0, ymm1             // UV
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklwd ymm0, ymm0, ymm0             // UVUV
+    vpmaddubsw ymm2, ymm0, kUVToB_AVX       // scale B UV
+    vpmaddubsw ymm1, ymm0, kUVToG_AVX       // scale G UV
+    vpmaddubsw ymm0, ymm0, kUVToR_AVX       // scale R UV
+    vpsubw     ymm2, ymm2, kUVBiasB_AVX     // unbias back to signed
+    vpsubw     ymm1, ymm1, kUVBiasG_AVX
+    vpsubw     ymm0, ymm0, kUVBiasR_AVX
+
+    // Step 2: Find Y contribution to 16 R,G,B values
+    vmovdqu    xmm3, [eax]                  // NOLINT
+    lea        eax, [eax + 16]
+    vpermq     ymm3, ymm3, 0xd8
+    vpunpcklbw ymm3, ymm3, ymm4
+    vpsubsw    ymm3, ymm3, kYSub16_AVX
+    vpmullw    ymm3, ymm3, kYToRgb_AVX
+    vpaddsw    ymm2, ymm2, ymm3           // B += Y
+    vpaddsw    ymm1, ymm1, ymm3           // G += Y
+    vpaddsw    ymm0, ymm0, ymm3           // R += Y
+    vpsraw     ymm2, ymm2, 6
+    vpsraw     ymm1, ymm1, 6
+    vpsraw     ymm0, ymm0, 6
+    vpackuswb  ymm2, ymm2, ymm2           // B
+    vpackuswb  ymm1, ymm1, ymm1           // G
+    vpackuswb  ymm0, ymm0, ymm0           // R
+
+    // Step 3: Weave into ARGB
+    vpunpcklbw ymm2, ymm2, ymm1           // BG
+    vpermq     ymm2, ymm2, 0xd8
+    vpunpcklbw ymm0, ymm0, ymm5           // RA
+    vpermq     ymm0, ymm0, 0xd8
+    vpunpcklwd ymm1, ymm2, ymm0           // BGRA first 8 pixels
+    vpunpckhwd ymm2, ymm2, ymm0           // BGRA next 8 pixels
+    vmovdqu    [edx], ymm1
+    vmovdqu    [edx + 32], ymm2
+    lea        edx,  [edx + 64]
+    sub        ecx, 16
+    jg         convertloop
+    vzeroupper
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_I422TOARGBROW_AVX2
+
+#ifdef HAS_I422TOARGBROW_SSSE3
+
 static const vec8 kUVToB = {
   UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
 };
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index ac358b4c4..86a47ec9d 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -497,11 +497,10 @@ TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1, ARGB, 4)
 TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 0, ARGB, 4)
 TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 0, ARGB, 4)
 TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4)
-// TODO(fbarchard): These tests fail is width is odd.
-// TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
-// TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
-// TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
-// TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGBRG, 1, 1, 2, ARGB, 4)
+TESTPLANARTOB(I420, 2, 2, BayerGRBG, 1, 1, 2, ARGB, 4)
 
 #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \
                          W1280, DIFF, N, NEG, OFF) \