diff --git a/README.chromium b/README.chromium index f891fcca1..2564fcc8d 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 137 +Version: 140 License: BSD License File: LICENSE diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 018b9c11a..9469dbb6a 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1653,7 +1653,7 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) +#if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1709,7 +1709,7 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) +#if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1765,7 +1765,7 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) +#if defined(HAS_ARGBTORGB565ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -1821,7 +1821,7 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTOARGB1555ROW_SSE2_DISABLED) +#if defined(HAS_ARGBTOARGB1555ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -2195,7 +2195,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORGB24ROW_SSSE3_DISABLED) +#if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && @@ -2225,7 +2225,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, src_stride_argb = -src_stride_argb; } void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORAWROW_SSSE3_DISABLED) +#if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) && @@ -2276,10 +2276,10 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y, SIMD_ALIGNED(uint8 row[kMaxStride]); void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix); -#if defined(HAS_ARGBTORGB565ROW_SSE2_DISABLED) +#if defined(HAS_ARGBTORGB565ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + IS_ALIGNED(dst_rgb, 16) && IS_ALIGNED(dst_stride_rgb, 16)) { ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } else #endif diff --git a/source/row_win.cc b/source/row_win.cc index ecd9a82e9..f47ea4b83 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -523,16 +523,16 @@ __asm { pshufb xmm2, xmm5 pshufb xmm3, xmm5 movdqa xmm4, xmm1 - psllq xmm4, 12 + pslldq xmm4, 12 por xmm4, xmm0 movdqa [edx], xmm4 // first 16 bytes movdqa xmm4, xmm2 - psrlq xmm1, 4 - psllq xmm4, 8 + psrldq xmm1, 4 + pslldq xmm4, 8 por xmm1, xmm4 movdqa [edx + 16], xmm1 // middle 16 bytes - psrlq xmm2, 8 - psllq xmm3, 4 + psrldq xmm2, 8 + pslldq xmm3, 4 por xmm2, xmm3 movdqa [edx + 32], xmm2 // last 16 bytes lea edx, [edx + 48] @@ -562,16 +562,16 @@ __asm { pshufb xmm2, xmm5 pshufb xmm3, xmm5 movdqa xmm4, xmm1 - psllq xmm4, 12 + pslldq xmm4, 12 por xmm4, xmm0 movdqa [edx], xmm4 // first 16 bytes movdqa xmm4, xmm2 - psrlq xmm1, 4 - psllq xmm4, 8 + psrldq xmm1, 4 + pslldq xmm4, 8 por xmm1, xmm4 movdqa [edx + 16], xmm1 // middle 16 bytes - psrlq xmm2, 8 - psllq xmm3, 4 + psrldq xmm2, 8 + pslldq xmm3, 4 por xmm2, xmm3 movdqa [edx + 32], xmm2 // last 16 bytes lea edx, [edx + 48] @@ -582,6 +582,7 @@ __asm { } // TODO(fbarchard): Port to gcc +// TODO(fbarchard): Improve sign extension/packing __declspec(naked) void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { @@ -591,7 +592,7 @@ __asm { psrlw xmm4, 10 psllw xmm4, 5 pcmpeqb xmm5, xmm5 // generate mask 0xf800f800 - psrlw xmm5, 11 + psllw xmm5, 11 mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb @@ -599,20 +600,20 @@ __asm { convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb - lea eax, [eax + 16] movdqa xmm1, xmm0 // B - psrlw xmm1, 3 + psrld xmm1, 3 pand xmm1, xmm3 movdqa xmm2, xmm0 // G - psrlw xmm2, 5 + psrld xmm2, 5 pand xmm2, xmm4 por xmm1, xmm2 - psrlw xmm0, 8 // R + psrld xmm0, 8 // R pand xmm0, xmm5 por xmm0, xmm1 pslld xmm0, 16 psrad xmm0, 16 packssdw xmm0, xmm0 + lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 lea edx, [edx + 8] sub ecx, 4 @@ -622,6 +623,7 @@ __asm { } // TODO(fbarchard): Port to gcc +// TODO(fbarchard): Improve sign extension/packing __declspec(naked) void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { __asm { @@ -629,10 +631,10 @@ __asm { psrlw xmm3, 11 movdqa xmm4, xmm3 // generate mask 0x03e003e0 psllw xmm4, 5 - movdqa xmm5, xmm3 // generate mask 0x7c007c00 - psllw xmm5, 10 + movdqa xmm5, xmm4 // generate mask 0x7c007c00 + psllw xmm5, 5 pcmpeqb xmm6, xmm6 // generate mask 0x80008000 - psrlw xmm6, 15 + psllw xmm6, 15 mov eax, [esp + 4] // src_argb mov edx, [esp + 8] // dst_rgb @@ -640,26 +642,25 @@ __asm { convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb - lea eax, [eax + 16] movdqa xmm1, xmm0 // B - psrlw xmm1, 3 + psrld xmm1, 3 pand xmm1, xmm3 movdqa xmm2, xmm0 // G - psrlw xmm2, 6 + psrld xmm2, 6 pand xmm2, xmm4 por xmm1, xmm2 movdqa xmm2, xmm0 // R - psrlw xmm2, 9 + psrld xmm2, 9 pand xmm2, xmm5 por xmm1, xmm2 - movdqa xmm2, xmm0 // A - psrlw xmm2, 16 - pand xmm2, xmm6 - por xmm1, xmm2 + psrld xmm0, 16 // A + pand xmm0, xmm6 + por xmm0, xmm1 pslld xmm0, 16 psrad xmm0, 16 - packssdw xmm1, xmm1 - movq qword ptr [edx], xmm1 // store 4 pixels of ARGB1555 + packssdw xmm0, xmm0 + lea eax, [eax + 16] + movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 lea edx, [edx + 8] sub ecx, 4 ja convertloop @@ -682,7 +683,6 @@ __asm { convertloop: movdqa xmm0, [eax] // fetch 4 pixels of argb - lea eax, [eax + 16] movdqa xmm1, xmm0 pand xmm0, xmm3 // low nibble pand xmm1, xmm4 // high nibble @@ -690,6 +690,7 @@ __asm { psrl xmm1, 8 por xmm0, xmm1 packuswb xmm0, xmm0 + lea eax, [eax + 16] movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 lea edx, [edx + 8] sub ecx, 4