diff --git a/README.chromium b/README.chromium index b3da154c1..839f3b0f3 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 398 +Version: 399 License: BSD License File: LICENSE diff --git a/include/libyuv/row.h b/include/libyuv/row.h index 4814f2544..ebbc4572c 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -100,6 +100,8 @@ extern "C" { #define HAS_RGBATOARGBROW_SSSE3 #define HAS_RGBATOUVROW_SSSE3 #define HAS_RGBATOYROW_SSSE3 +#define HAS_I422TORGB24ROW_SSSE3 +#define HAS_I422TORAWROW_SSSE3 #endif // The following are disabled when SSSE3 is available: @@ -436,6 +438,19 @@ void I422ToRGBARow_SSSE3(const uint8* y_buf, uint8* rgba_buf, int width); +// RGB24/RAW are unaligned. +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, @@ -528,6 +543,19 @@ void I422ToRGBARow_Any_SSSE3(const uint8* y_buf, uint8* rgba_buf, int width); +// RGB24/RAW are unaligned. 
+void I422ToRGB24Row_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void I422ToRAWRow_Any_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + void YToARGBRow_SSE2(const uint8* y_buf, uint8* argb_buf, int width); diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b12c2e7ac..d761e3295 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 398 +#define LIBYUV_VERSION 399 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/convert_from.cc b/source/convert_from.cc index 4ea974acf..817dbd8b6 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -928,10 +928,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRGB24Row = I422ToRGB24Row_Unaligned_SSSE3; - if (IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { - I422ToRGB24Row = I422ToRGB24Row_SSSE3; - } + I422ToRGB24Row = I422ToRGB24Row_SSSE3; } } #endif @@ -982,10 +979,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I422ToRAWRow = I422ToRAWRow_Any_SSSE3; if (IS_ALIGNED(width, 8)) { - I422ToRAWRow = I422ToRAWRow_Unaligned_SSSE3; - if (IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { - I422ToRAWRow = I422ToRAWRow_SSSE3; - } + I422ToRAWRow = I422ToRAWRow_SSSE3; } } #endif diff --git a/source/row_common.cc b/source/row_common.cc index c5f3ce050..e0e426cd8 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -1023,9 +1023,9 @@ YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C, 1) YANY(I422ToABGRRow_Any_SSSE3, I422ToABGRRow_Unaligned_SSSE3, I422ToABGRRow_C, 1) #endif #ifdef 
HAS_I422TORGB24ROW_SSSE3 -YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_Unaligned_SSSE3, \ - I422ToRGB24Row_C, 1) -YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_Unaligned_SSSE3, I422ToRAWRow_C, 1) +// I422ToRGB24Row_SSSE3 is unaligned. +YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1) +YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1) #endif #ifdef HAS_I422TORGBAROW_SSSE3 YANY(I422ToRGBARow_Any_SSSE3, I422ToRGBARow_Unaligned_SSSE3, I422ToRGBARow_C, 1) diff --git a/source/row_win.cc b/source/row_win.cc index e3b01f27f..de70b9435 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -122,6 +122,16 @@ static const uvec8 kShuffleMaskARGBToRAW = { 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u }; +// Shuffle table for converting ARGBToRGB24 for I420ToRGB24. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRGB24_0 = { + 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u +}; + +// Shuffle table for converting ARGBToRAW for I420ToRAW. First 8 + next 4 +static const uvec8 kShuffleMaskARGBToRAW_0 = { + 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u +}; + __declspec(naked) __declspec(align(16)) void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { __asm { @@ -1654,6 +1664,100 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, } } +// 8 pixels, dest may be unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RGB24 (24 bytes). 
+__declspec(naked) __declspec(align(16)) +void I422ToRGB24Row_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb24_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb24 + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRGB24_0 + movdqa xmm6, kShuffleMaskARGBToRGB24 + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRR + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. + pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RGB pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +// 8 pixels, dest may be unaligned. +// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 RAW (24 bytes). +__declspec(naked) __declspec(align(16)) +void I422ToRAWRow_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* raw_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // raw + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + movdqa xmm5, kShuffleMaskARGBToRAW_0 + movdqa xmm6, kShuffleMaskARGBToRAW + + align 16 + convertloop: + READYUV422 + YUVTORGB + + // Step 3: Weave into BGRR + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm2 // RR + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRR first 4 pixels + punpckhwd xmm1, xmm2 // BGRR next 4 pixels + pshufb xmm0, xmm5 // Pack into first 8 and last 4 bytes. 
+ pshufb xmm1, xmm6 // Pack into first 12 bytes. + palignr xmm1, xmm0, 12 // last 4 bytes of xmm0 + 12 from xmm1 + movq qword ptr [edx], xmm0 // First 8 bytes + movdqu [edx + 8], xmm1 // Last 16 bytes. = 24 bytes, 8 RAW pixels. + lea edx, [edx + 24] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + // 8 pixels, dest aligned 16. // 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). __declspec(naked) __declspec(align(16))