diff --git a/README.chromium b/README.chromium index e835cc1ba..a6f7c5f4a 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 358 +Version: 359 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 23e91e3c6..8e4218354 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 358 +#define LIBYUV_VERSION 359 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row.h b/source/row.h index 134dcb8b3..fb833bf8e 100644 --- a/source/row.h +++ b/source/row.h @@ -37,9 +37,6 @@ extern "C" { #define HAS_ABGRTOARGBROW_SSSE3 #define HAS_ABGRTOUVROW_SSSE3 #define HAS_ABGRTOYROW_SSSE3 -// TODO(FBARCHARD): #define HAS_RGBATOARGBROW_SSSE3 -// TODO(FBARCHARD): #define HAS_RGBATOUVROW_SSSE3 -// TODO(FBARCHARD): #define HAS_RGBATOYROW_SSSE3 #define HAS_ARGBTORGBAROW_SSSE3 #define HAS_ARGB1555TOARGBROW_SSE2 #define HAS_ARGB4444TOARGBROW_SSE2 @@ -61,7 +58,6 @@ extern "C" { #define HAS_I422TOARGBROW_SSSE3 #define HAS_I422TOBGRAROW_SSSE3 #define HAS_I422TOABGRROW_SSSE3 -// TODO(FBARCHARD): #define HAS_I422TORGBAROW_SSSE3 #define HAS_I444TOARGBROW_SSSE3 #define HAS_I411TOARGBROW_SSSE3 #define HAS_I400TOARGBROW_SSE2 @@ -91,10 +87,13 @@ extern "C" { // The following are Windows only: #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) -// TODO(fbarchard): Investigate possible issue in this function and reenable. 
#define HAS_ARGBCOLORTABLEROW_X86 #define HAS_NV12TOARGBROW_SSSE3 #define HAS_NV21TOARGBROW_SSSE3 +#define HAS_I422TORGBAROW_SSSE3 +#define HAS_RGBATOARGBROW_SSSE3 +#define HAS_RGBATOUVROW_SSSE3 +#define HAS_RGBATOYROW_SSSE3 #endif // The following are disabled when SSSE3 is available: diff --git a/source/row_common.cc b/source/row_common.cc index 03b1c1074..cd8d163cb 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -941,7 +941,7 @@ void ARGBUnattenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } -#if defined(HAS_I422TOARGBROW_SSSE3) +#ifdef HAS_I422TOARGBROW_SSSE3 YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C, 0) YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C, 1) YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C, 2) @@ -949,13 +949,17 @@ Y2NY(NV12ToARGBRow_Any_SSSE3, NV12ToARGBRow_Unaligned_SSSE3, NV12ToARGBRow_C, 0) Y2NY(NV21ToARGBRow_Any_SSSE3, NV21ToARGBRow_Unaligned_SSSE3, NV21ToARGBRow_C, 0) YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) -// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) #endif -#if defined(HAS_I422TOARGBROW_NEON) +#ifdef HAS_I422TORGBAROW_SSSE3 +YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) +#endif +#ifdef HAS_I422TOARGBROW_NEON YANY(I422ToARGBRow_Any_NEON, I422ToARGBRow_NEON, I422ToARGBRow_C, 1) YANY(I422ToBGRARow_Any_NEON, I422ToBGRARow_NEON, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_NEON, I422ToABGRRow_NEON, I422ToABGRRow_C, 1) -// TODO(fbarchard): NOLINT YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1) +#endif +#ifdef HAS_I422TORGBAROW_NEON +YANY(I422ToRGBARow_Any_NEON, I422ToRGBARow_NEON, I422ToRGBARow_C, 1) #endif #undef YANY @@ -987,7 +991,9 @@ RGBANY(ARGBToARGB4444Row_Any_SSE2, 
ARGBToARGB4444Row_SSE2, 2) YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4) YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4) YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4) -// TODO(fbarchard): YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4) +#ifdef HAS_RGBATOYROW_SSSE3 +YANY(RGBAToYRow_Any_SSSE3, RGBAToYRow_Unaligned_SSSE3, 4) +#endif YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2) YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2) #undef YANY @@ -1006,7 +1012,9 @@ YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2) UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4) UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4) UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4) -// TODO(fbarchard): NOLINT UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4) +#ifdef HAS_RGBATOUVROW_SSSE3 +UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4) +#endif UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2) UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) #undef UVANY diff --git a/source/row_win.cc b/source/row_win.cc index b8970bf58..ec37db9b0 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -59,6 +59,19 @@ static const vec8 kABGRToV = { 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 }; +// Constants for RGBA. 
+static const vec8 kRGBAToY = { + 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 +}; + +static const vec8 kRGBAToU = { + 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 +}; + +static const vec8 kRGBAToV = { + 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 +}; + static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; @@ -78,16 +91,16 @@ static const uvec8 kShuffleMaskRAWToARGB = { 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u }; -// Shuffle table for converting ABGR to ARGB. -static const uvec8 kShuffleMaskABGRToARGB = { - 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u -}; - // Shuffle table for converting BGRA to ARGB. static const uvec8 kShuffleMaskBGRAToARGB = { 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u }; +// Shuffle table for converting ABGR to ARGB. +static const uvec8 kShuffleMaskABGRToARGB = { + 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u +}; + // Shuffle table for converting RGBA to ARGB. 
static const uvec8 kShuffleMaskRGBAToARGB = { 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u @@ -137,12 +150,12 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { } __declspec(naked) __declspec(align(16)) -void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { +void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { __asm { - mov eax, [esp + 4] // src_abgr + mov eax, [esp + 4] // src_bgra mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskABGRToARGB + movdqa xmm5, kShuffleMaskBGRAToARGB sub edx, eax align 16 @@ -158,12 +171,12 @@ __asm { } __declspec(naked) __declspec(align(16)) -void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { +void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { __asm { - mov eax, [esp + 4] // src_bgra + mov eax, [esp + 4] // src_abgr mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // pix - movdqa xmm5, kShuffleMaskBGRAToARGB + movdqa xmm5, kShuffleMaskABGRToARGB sub edx, eax align 16 @@ -844,6 +857,74 @@ __asm { } } +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + movdqa [edx], xmm0 + lea edx, [edx + 16] + sub ecx, 16 + jg convertloop + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { +__asm { + mov eax, [esp + 4] /* src_argb 
*/ + mov edx, [esp + 8] /* dst_y */ + mov ecx, [esp + 12] /* pix */ + movdqa xmm5, kAddY16 + movdqa xmm4, kRGBAToY + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + pmaddubsw xmm0, xmm4 + pmaddubsw xmm1, xmm4 + pmaddubsw xmm2, xmm4 + pmaddubsw xmm3, xmm4 + lea eax, [eax + 64] + phaddw xmm0, xmm1 + phaddw xmm2, xmm3 + psrlw xmm0, 7 + psrlw xmm2, 7 + packuswb xmm0, xmm2 + paddb xmm0, xmm5 + sub ecx, 16 + movdqu [edx], xmm0 + lea edx, [edx + 16] + jg convertloop + ret + } +} + __declspec(naked) __declspec(align(16)) void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, uint8* dst_u, uint8* dst_v, int width) { @@ -1251,6 +1332,142 @@ __asm { ret } } + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + movdqa xmm2, [eax + 32] + movdqa xmm3, [eax + 48] + pavgb xmm0, [eax + esi] + pavgb xmm1, [eax + esi + 16] + pavgb xmm2, [eax + esi + 32] + pavgb xmm3, [eax + esi + 48] + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // 
V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, [edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, + uint8* dst_u, uint8* dst_v, int width) { +__asm { + push esi + push edi + mov eax, [esp + 8 + 4] // src_argb + mov esi, [esp + 8 + 8] // src_stride_argb + mov edx, [esp + 8 + 12] // dst_u + mov edi, [esp + 8 + 16] // dst_v + mov ecx, [esp + 8 + 20] // pix + movdqa xmm7, kRGBAToU + movdqa xmm6, kRGBAToV + movdqa xmm5, kAddUV128 + sub edi, edx // stride from u to v + + align 16 + convertloop: + /* step 1 - subsample 16x2 argb pixels to 8x1 */ + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + movdqu xmm2, [eax + 32] + movdqu xmm3, [eax + 48] + movdqu xmm4, [eax + esi] + pavgb xmm0, xmm4 + movdqu xmm4, [eax + esi + 16] + pavgb xmm1, xmm4 + movdqu xmm4, [eax + esi + 32] + pavgb xmm2, xmm4 + movdqu xmm4, [eax + esi + 48] + pavgb xmm3, xmm4 + lea eax, [eax + 64] + movdqa xmm4, xmm0 + shufps xmm0, xmm1, 0x88 + shufps xmm4, xmm1, 0xdd + pavgb xmm0, xmm4 + movdqa xmm4, xmm2 + shufps xmm2, xmm3, 0x88 + shufps xmm4, xmm3, 0xdd + pavgb xmm2, xmm4 + + // step 2 - convert to U and V + // from here down is very similar to Y code except + // instead of 16 different pixels, its 8 pixels of U and 8 of V + movdqa xmm1, xmm0 + movdqa xmm3, xmm2 + pmaddubsw xmm0, xmm7 // U + pmaddubsw xmm2, xmm7 + pmaddubsw xmm1, xmm6 // V + pmaddubsw xmm3, xmm6 + phaddw xmm0, xmm2 + phaddw xmm1, xmm3 + psraw xmm0, 8 + psraw xmm1, 8 + packsswb xmm0, xmm1 + paddb xmm0, xmm5 // -> unsigned + + // step 3 - store 8 U and 8 V values + sub ecx, 16 + movlps qword ptr [edx], xmm0 // U + movhps qword ptr [edx + edi], xmm0 // V + lea edx, 
[edx + 8] + jg convertloop + + pop edi + pop esi + ret + } +} #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 @@ -1847,47 +2064,6 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, } } -__declspec(naked) __declspec(align(16)) -void I422ToABGRRow_SSSE3(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* abgr_buf, - int width) { - __asm { - push esi - push edi - mov eax, [esp + 8 + 4] // Y - mov esi, [esp + 8 + 8] // U - mov edi, [esp + 8 + 12] // V - mov edx, [esp + 8 + 16] // abgr - mov ecx, [esp + 8 + 20] // width - sub edi, esi - pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha - pxor xmm4, xmm4 - - align 16 - convertloop: - READYUV422 - YUVTORGB - - // Step 3: Weave into ARGB - punpcklbw xmm2, xmm1 // RG - punpcklbw xmm0, xmm5 // BA - movdqa xmm1, xmm2 - punpcklwd xmm2, xmm0 // RGBA first 4 pixels - punpckhwd xmm1, xmm0 // RGBA next 4 pixels - movdqa [edx], xmm2 - movdqa [edx + 16], xmm1 - lea edx, [edx + 32] - sub ecx, 8 - jg convertloop - - pop edi - pop esi - ret - } -} - __declspec(naked) __declspec(align(16)) void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -1929,6 +2105,47 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, } } +__declspec(naked) __declspec(align(16)) +void I422ToABGRRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* abgr_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // abgr + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into ABGR + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqa [edx], xmm2 + movdqa [edx + 16], xmm1 + lea edx, [edx + 32] + sub 
ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + __declspec(naked) __declspec(align(16)) void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -1969,6 +2186,89 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, ret } } + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqa [edx], xmm5 + movdqa [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) __declspec(align(16)) +void I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgba_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgba + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into RGBA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm2 // GR + punpcklbw xmm5, xmm0 // AB + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // RGBA first 4 pixels + punpckhwd xmm0, xmm1 // RGBA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + #endif // HAS_I422TOARGBROW_SSSE3 #ifdef HAS_YTOARGBROW_SSE2