diff --git a/README.chromium b/README.chromium
index 6c76507cb..89c1ea467 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 160
+Version: 161
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 4e6385d7b..5720e9063 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -16,7 +16,7 @@
 namespace libyuv {
 extern "C" {
 #endif
 
-#define LIBYUV_VERSION 160
+#define LIBYUV_VERSION 161
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/source/convert.cc b/source/convert.cc
index 2bb7bc81c..43f8723ba 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -365,6 +365,11 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
       IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
     ARGBToYRow = ARGBToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ARGBToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
@@ -375,6 +380,12 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
     ARGBToUVRow = ARGBToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
@@ -416,6 +427,11 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
       IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
     ARGBToYRow = BGRAToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = BGRAToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = BGRAToYRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
@@ -426,6 +442,12 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
     ARGBToUVRow = BGRAToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = BGRAToUVAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
@@ -467,6 +489,11 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
       IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
     ARGBToYRow = ABGRToYRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
+    ARGBToYRow = ABGRToYAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToYRow = ABGRToYRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
@@ -477,6 +504,12 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
       IS_ALIGNED(width, 16) &&
       IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
     ARGBToUVRow = ABGRToUVRow_SSSE3;
+  } else if (TestCpuFlag(kCpuHasSSSE3) &&
+             IS_ALIGNED(width, 2) && width <= kMaxStride) {
+    ARGBToUVRow = ABGRToUVAnyRow_SSSE3;
+    if (IS_ALIGNED(width, 16)) {
+      ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+    }
   } else
 #endif
   {
diff --git a/source/row.h b/source/row.h
index 74984ad09..20ffdc8d3 100644
--- a/source/row.h
+++ b/source/row.h
@@ -100,12 +100,22 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width);
 
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
@@ -235,6 +245,16 @@ void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
 
+void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
+
 void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf,
                                      const uint8* u_buf,
                                      const uint8* v_buf,
diff --git a/source/row_common.cc b/source/row_common.cc
index d247c2b73..add01c803 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -380,8 +380,17 @@ void NAMEANY(const uint8* y_buf, \
     memcpy(rgb_buf, row, width << 2); \
   }
 
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
+MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
+MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
+MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
+#endif
+#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
+MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
+MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
+MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
+#endif
-// Wrappers to handle odd sizes/alignments
 
 #define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
 void NAMEANY(const uint8* argb_buf, \
              uint8* rgb_buf, \
@@ -391,20 +400,40 @@ void NAMEANY(const uint8* argb_buf, \
     memcpy(rgb_buf, row, width * BPP); \
   }
 
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
-MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
-MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
-MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
 MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3)
 MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3)
 MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2)
 MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2)
 MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2)
 #endif
-#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
-MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
-MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
-MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
+
+#ifdef HAS_ARGBTOYROW_SSSE3
+
+#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \
+  void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
+    SIMD_ALIGNED(uint8 row[kMaxStride]); \
+    ARGBTOY(src_argb, row, width); \
+    memcpy(dst_y, row, width); \
+  }
+
+MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
+MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3)
+MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
+
+#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \
+  void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
+               uint8* dst_u, uint8* dst_v, int width) { \
+    SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \
+    ARGBTOUV(src_argb0, src_stride_argb, row, row + kMaxStride, width); \
+    int halfwidth = (width + 1) >> 1; \
+    memcpy(dst_u, row, halfwidth); \
+    memcpy(dst_v, row + kMaxStride, halfwidth); \
+  }
+
+MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
+MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
+MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
 #endif
 
 #ifdef __cplusplus
diff --git a/source/row_posix.cc b/source/row_posix.cc
index a79277268..3003ac4f1 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -257,6 +257,43 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
     , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
 #endif
+);
+}
+
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  asm volatile (
+  "movdqa %4,%%xmm5 \n"
+  "movdqa %3,%%xmm4 \n"
+"1: \n"
+  "movdqu (%0),%%xmm0 \n"
+  "movdqu 0x10(%0),%%xmm1 \n"
+  "movdqu 0x20(%0),%%xmm2 \n"
+  "movdqu 0x30(%0),%%xmm3 \n"
+  "pmaddubsw %%xmm4,%%xmm0 \n"
+  "pmaddubsw %%xmm4,%%xmm1 \n"
+  "pmaddubsw %%xmm4,%%xmm2 \n"
+  "pmaddubsw %%xmm4,%%xmm3 \n"
+  "lea 0x40(%0),%0 \n"
+  "phaddw %%xmm1,%%xmm0 \n"
+  "phaddw %%xmm3,%%xmm2 \n"
+  "psrlw $0x7,%%xmm0 \n"
+  "psrlw $0x7,%%xmm2 \n"
+  "packuswb %%xmm2,%%xmm0 \n"
+  "paddb %%xmm5,%%xmm0 \n"
+  "movdqu %%xmm0,(%1) \n"
+  "lea 0x10(%1),%1 \n"
+  "sub $0x10,%2 \n"
+  "ja 1b \n"
+  : "+r"(src_argb),  // %0
+    "+r"(dst_y),     // %1
+    "+r"(pix)        // %2
+  : "m"(kARGBToY),   // %3
+    "m"(kAddY16)     // %4
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
+#endif
 );
 }
 #endif
@@ -325,6 +362,74 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
 #endif
 );
 }
+
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+  asm volatile (
+  "movdqa %0,%%xmm4 \n"
+  "movdqa %1,%%xmm3 \n"
+  "movdqa %2,%%xmm5 \n"
+  :
+  : "m"(kARGBToU),   // %0
+    "m"(kARGBToV),   // %1
+    "m"(kAddUV128)   // %2
+  :
+#if defined(__SSE2__)
+    "xmm3", "xmm4", "xmm5"
+#endif
+  );
+  asm volatile (
+  "sub %1,%2 \n"
+"1: \n"
+  "movdqu (%0),%%xmm0 \n"
+  "movdqu 0x10(%0),%%xmm1 \n"
+  "movdqu 0x20(%0),%%xmm2 \n"
+  "movdqu 0x30(%0),%%xmm6 \n"
+  "movdqu (%0,%4,1),%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm0 \n"
+  "movdqu 0x10(%0,%4,1),%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm1 \n"
+  "movdqu 0x20(%0,%4,1),%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm2 \n"
+  "movdqu 0x30(%0,%4,1),%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm6 \n"
+  "lea 0x40(%0),%0 \n"
+  "movdqa %%xmm0,%%xmm7 \n"
+  "shufps $0x88,%%xmm1,%%xmm0 \n"
+  "shufps $0xdd,%%xmm1,%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm0 \n"
+  "movdqa %%xmm2,%%xmm7 \n"
+  "shufps $0x88,%%xmm6,%%xmm2 \n"
+  "shufps $0xdd,%%xmm6,%%xmm7 \n"
+  "pavgb %%xmm7,%%xmm2 \n"
+  "movdqa %%xmm0,%%xmm1 \n"
+  "movdqa %%xmm2,%%xmm6 \n"
+  "pmaddubsw %%xmm4,%%xmm0 \n"
+  "pmaddubsw %%xmm4,%%xmm2 \n"
+  "pmaddubsw %%xmm3,%%xmm1 \n"
+  "pmaddubsw %%xmm3,%%xmm6 \n"
+  "phaddw %%xmm2,%%xmm0 \n"
+  "phaddw %%xmm6,%%xmm1 \n"
+  "psraw $0x8,%%xmm0 \n"
+  "psraw $0x8,%%xmm1 \n"
+  "packsswb %%xmm1,%%xmm0 \n"
+  "paddb %%xmm5,%%xmm0 \n"
+  "movlps %%xmm0,(%1) \n"
+  "movhps %%xmm0,(%1,%2,1) \n"
+  "lea 0x8(%1),%1 \n"
+  "sub $0x10,%3 \n"
+  "ja 1b \n"
+  : "+r"(src_argb0),  // %0
+    "+r"(dst_u),      // %1
+    "+r"(dst_v),      // %2
+    "+rm"(width)      // %3
+  : "r"(static_cast<intptr_t>(src_stride_argb))
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
+#endif
+);
+}
 #endif
 
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
@@ -624,6 +729,18 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   BGRAToARGBRow_SSSE3(src_argb, row, pix);
   ARGBToYRow_SSSE3(row, dst_y, pix);
 }
+
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  ABGRToARGBRow_C(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
+}
+
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride]);
+  BGRAToARGBRow_C(src_argb, row, pix);
+  ARGBToYRow_SSSE3(row, dst_y, pix);
+}
 #endif
 
 #ifdef HAS_ARGBTOUVROW_SSSE3
@@ -642,6 +759,22 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
   BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
   ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
 }
+
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  ABGRToARGBRow_C(src_argb, row, pix);
+  ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
+
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int pix) {
+  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
+  BGRAToARGBRow_C(src_argb, row, pix);
+  BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
+  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
+}
 #endif
 
 #ifdef HAS_MIRRORROW_SSSE3
diff --git a/source/row_win.cc b/source/row_win.cc
index 805811897..657269def 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -611,6 +611,39 @@ __asm {
   }
 }
+__declspec(naked)
+void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov eax, [esp + 4]   /* src_argb */
+    mov edx, [esp + 8]   /* dst_y */
+    mov ecx, [esp + 12]  /* pix */
+    movdqa xmm5, kAddY16
+    movdqa xmm4, kARGBToY
+
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    pmaddubsw xmm0, xmm4
+    pmaddubsw xmm1, xmm4
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    lea eax, [eax + 64]
+    phaddw xmm0, xmm1
+    phaddw xmm2, xmm3
+    psrlw xmm0, 7
+    psrlw xmm2, 7
+    packuswb xmm0, xmm2
+    paddb xmm0, xmm5
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    ja convertloop
+    ret
+  }
+}
+
 __declspec(naked)
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 __asm {
@@ -644,6 +677,39 @@ __asm {
   }
 }
+__declspec(naked)
+void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov eax, [esp + 4]   /* src_argb */
+    mov edx, [esp + 8]   /* dst_y */
+    mov ecx, [esp + 12]  /* pix */
+    movdqa xmm5, kAddY16
+    movdqa xmm4, kBGRAToY
+
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    pmaddubsw xmm0, xmm4
+    pmaddubsw xmm1, xmm4
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    lea eax, [eax + 64]
+    phaddw xmm0, xmm1
+    phaddw xmm2, xmm3
+    psrlw xmm0, 7
+    psrlw xmm2, 7
+    packuswb xmm0, xmm2
+    paddb xmm0, xmm5
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    ja convertloop
+    ret
+  }
+}
+
 __declspec(naked)
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
 __asm {
@@ -677,6 +743,39 @@ __asm {
   }
 }
+__declspec(naked)
+void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
+__asm {
+    mov eax, [esp + 4]   /* src_argb */
+    mov edx, [esp + 8]   /* dst_y */
+    mov ecx, [esp + 12]  /* pix */
+    movdqa xmm5, kAddY16
+    movdqa xmm4, kABGRToY
+
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    pmaddubsw xmm0, xmm4
+    pmaddubsw xmm1, xmm4
+    pmaddubsw xmm2, xmm4
+    pmaddubsw xmm3, xmm4
+    lea eax, [eax + 64]
+    phaddw xmm0, xmm1
+    phaddw xmm2, xmm3
+    psrlw xmm0, 7
+    psrlw xmm2, 7
+    packuswb xmm0, xmm2
+    paddb xmm0, xmm5
+    movdqu [edx], xmm0
+    lea edx, [edx + 16]
+    sub ecx, 16
+    ja convertloop
+    ret
+  }
+}
+
 __declspec(naked)
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
@@ -741,6 +840,75 @@ __asm {
   }
 }
+
+__declspec(naked)
+void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // src_argb
+    mov esi, [esp + 8 + 8]   // src_stride_argb
+    mov edx, [esp + 8 + 12]  // dst_u
+    mov edi, [esp + 8 + 16]  // dst_v
+    mov ecx, [esp + 8 + 20]  // pix
+    movdqa xmm7, kARGBToU
+    movdqa xmm6, kARGBToV
+    movdqa xmm5, kAddUV128
+    sub edi, edx             // stride from u to v
+
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    movdqu xmm4, [eax + esi]
+    pavgb xmm0, xmm4
+    movdqu xmm4, [eax + esi + 16]
+    pavgb xmm1, xmm4
+    movdqu xmm4, [eax + esi + 32]
+    pavgb xmm2, xmm4
+    movdqu xmm4, [eax + esi + 48]
+    pavgb xmm3, xmm4
+    lea eax, [eax + 64]
+    movdqa xmm4, xmm0
+    shufps xmm0, xmm1, 0x88
+    shufps xmm4, xmm1, 0xdd
+    pavgb xmm0, xmm4
+    movdqa xmm4, xmm2
+    shufps xmm2, xmm3, 0x88
+    shufps xmm4, xmm3, 0xdd
+    pavgb xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa xmm1, xmm0
+    movdqa xmm3, xmm2
+    pmaddubsw xmm0, xmm7  // U
+    pmaddubsw xmm2, xmm7
+    pmaddubsw xmm1, xmm6  // V
+    pmaddubsw xmm3, xmm6
+    phaddw xmm0, xmm2
+    phaddw xmm1, xmm3
+    psraw xmm0, 8
+    psraw xmm1, 8
+    packsswb xmm0, xmm1
+    paddb xmm0, xmm5      // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps qword ptr [edx], xmm0        // U
+    movhps qword ptr [edx + edi], xmm0  // V
+    lea edx, [edx + 8]
+    sub ecx, 16
+    ja convertloop
+    pop edi
+    pop esi
+    ret
+  }
+}
+
 __declspec(naked)
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
@@ -805,6 +973,74 @@ __asm {
   }
 }
+__declspec(naked)
+void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // src_argb
+    mov esi, [esp + 8 + 8]   // src_stride_argb
+    mov edx, [esp + 8 + 12]  // dst_u
+    mov edi, [esp + 8 + 16]  // dst_v
+    mov ecx, [esp + 8 + 20]  // pix
+    movdqa xmm7, kBGRAToU
+    movdqa xmm6, kBGRAToV
+    movdqa xmm5, kAddUV128
+    sub edi, edx             // stride from u to v
+
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    movdqu xmm4, [eax + esi]
+    pavgb xmm0, xmm4
+    movdqu xmm4, [eax + esi + 16]
+    pavgb xmm1, xmm4
+    movdqu xmm4, [eax + esi + 32]
+    pavgb xmm2, xmm4
+    movdqu xmm4, [eax + esi + 48]
+    pavgb xmm3, xmm4
+    lea eax, [eax + 64]
+    movdqa xmm4, xmm0
+    shufps xmm0, xmm1, 0x88
+    shufps xmm4, xmm1, 0xdd
+    pavgb xmm0, xmm4
+    movdqa xmm4, xmm2
+    shufps xmm2, xmm3, 0x88
+    shufps xmm4, xmm3, 0xdd
+    pavgb xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa xmm1, xmm0
+    movdqa xmm3, xmm2
+    pmaddubsw xmm0, xmm7  // U
+    pmaddubsw xmm2, xmm7
+    pmaddubsw xmm1, xmm6  // V
+    pmaddubsw xmm3, xmm6
+    phaddw xmm0, xmm2
+    phaddw xmm1, xmm3
+    psraw xmm0, 8
+    psraw xmm1, 8
+    packsswb xmm0, xmm1
+    paddb xmm0, xmm5      // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps qword ptr [edx], xmm0        // U
+    movhps qword ptr [edx + edi], xmm0  // V
+    lea edx, [edx + 8]
+    sub ecx, 16
+    ja convertloop
+    pop edi
+    pop esi
+    ret
+  }
+}
+
 __declspec(naked)
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
@@ -869,6 +1105,75 @@ __asm {
   }
 }
+
+__declspec(naked)
+void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
+                                 uint8* dst_u, uint8* dst_v, int width) {
+__asm {
+    push esi
+    push edi
+    mov eax, [esp + 8 + 4]   // src_argb
+    mov esi, [esp + 8 + 8]   // src_stride_argb
+    mov edx, [esp + 8 + 12]  // dst_u
+    mov edi, [esp + 8 + 16]  // dst_v
+    mov ecx, [esp + 8 + 20]  // pix
+    movdqa xmm7, kABGRToU
+    movdqa xmm6, kABGRToV
+    movdqa xmm5, kAddUV128
+    sub edi, edx             // stride from u to v
+
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    movdqu xmm2, [eax + 32]
+    movdqu xmm3, [eax + 48]
+    movdqu xmm4, [eax + esi]
+    pavgb xmm0, xmm4
+    movdqu xmm4, [eax + esi + 16]
+    pavgb xmm1, xmm4
+    movdqu xmm4, [eax + esi + 32]
+    pavgb xmm2, xmm4
+    movdqu xmm4, [eax + esi + 48]
+    pavgb xmm3, xmm4
+    lea eax, [eax + 64]
+    movdqa xmm4, xmm0
+    shufps xmm0, xmm1, 0x88
+    shufps xmm4, xmm1, 0xdd
+    pavgb xmm0, xmm4
+    movdqa xmm4, xmm2
+    shufps xmm2, xmm3, 0x88
+    shufps xmm4, xmm3, 0xdd
+    pavgb xmm2, xmm4
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
+    movdqa xmm1, xmm0
+    movdqa xmm3, xmm2
+    pmaddubsw xmm0, xmm7  // U
+    pmaddubsw xmm2, xmm7
+    pmaddubsw xmm1, xmm6  // V
+    pmaddubsw xmm3, xmm6
+    phaddw xmm0, xmm2
+    phaddw xmm1, xmm3
+    psraw xmm0, 8
+    psraw xmm1, 8
+    packsswb xmm0, xmm1
+    paddb xmm0, xmm5      // -> unsigned
+
+    // step 3 - store 8 U and 8 V values
+    movlps qword ptr [edx], xmm0        // U
+    movhps qword ptr [edx + edi], xmm0  // V
+    lea edx, [edx + 8]
+    sub ecx, 16
+    ja convertloop
+    pop edi
+    pop esi
+    ret
+  }
+}
+
 #ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
 
 #define YG 74  /* static_cast<int8>(1.164 * 64 + 0.5) */
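
The selection logic this patch adds to convert.cc can be read as a three-tier fallback: fully aligned SSSE3 row, then the new unaligned/any-width SSSE3 rows, then the portable C row. The following is a minimal standalone C++ sketch of that selection for the ARGB-to-Y case only, not part of the patch: the stub row functions, HasSSSE3(), IsAligned(), and the kMaxStride value shown here are placeholders standing in for the real declarations in row.h, TestCpuFlag(kCpuHasSSSE3), the IS_ALIGNED macro, and libyuv's actual kMaxStride.

```cpp
#include <stdint.h>

// Stand-ins so the sketch is self-contained (the real functions live in row.h).
typedef void (*ARGBToYRowFn)(const uint8_t* src_argb, uint8_t* dst_y, int pix);
static void ARGBToYRow_C(const uint8_t*, uint8_t*, int) {}
static void ARGBToYRow_SSSE3(const uint8_t*, uint8_t*, int) {}
static void ARGBToYRow_Unaligned_SSSE3(const uint8_t*, uint8_t*, int) {}
static void ARGBToYAnyRow_SSSE3(const uint8_t*, uint8_t*, int) {}

static const int kMaxStride = 2048;      // assumed value, for illustration only
static bool HasSSSE3() { return true; }  // stands in for TestCpuFlag(kCpuHasSSSE3)
static bool IsAligned(uintptr_t v, int n) { return (v & (n - 1)) == 0; }

// Mirrors the three-tier dispatch added to ARGBToI420 in convert.cc.
ARGBToYRowFn PickARGBToYRow(const uint8_t* src, int src_stride,
                            uint8_t* dst_y, int dst_stride, int width) {
  if (HasSSSE3() && IsAligned(width, 16) &&
      IsAligned(reinterpret_cast<uintptr_t>(src), 16) && IsAligned(src_stride, 16) &&
      IsAligned(reinterpret_cast<uintptr_t>(dst_y), 16) && IsAligned(dst_stride, 16)) {
    return ARGBToYRow_SSSE3;   // everything 16-byte aligned: fastest path
  }
  if (HasSSSE3() && width <= kMaxStride) {
    return IsAligned(width, 16) ? ARGBToYRow_Unaligned_SSSE3  // width OK, pointers not
                                : ARGBToYAnyRow_SSSE3;        // odd width: buffered wrapper
  }
  return ARGBToYRow_C;         // portable fallback
}
```

The same pattern repeats for the BGRA/ABGR variants and for the UV rows, where the any-width wrappers (generated by MAKEARGBTOYANY/MAKEARGBTOUVANY in row_common.cc) run the unaligned SSSE3 row into an aligned kMaxStride scratch buffer and memcpy only the requested width out.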