diff --git a/README.chromium b/README.chromium index 5b4f2b808..e4e61cf01 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 231 +Version: 232 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 04283c8fa..c9aaade97 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define INCLUDE_LIBYUV_VERSION 231 +#define INCLUDE_LIBYUV_VERSION 232 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_from.cc b/source/convert_from.cc index 24156892b..9a0d32ad9 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -671,11 +671,13 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_I420TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I420ToARGBRow = I420ToARGBRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { - I420ToARGBRow = I420ToARGBRow_SSSE3; + if (IS_ALIGNED(width, 8)) { + I420ToARGBRow = I420ToARGBRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { + I420ToARGBRow = I420ToARGBRow_SSSE3; + } } } #endif @@ -717,11 +719,13 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_I420TOBGRAROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I420ToBGRARow = I420ToBGRARow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { - I420ToBGRARow = I420ToBGRARow_SSSE3; + if (IS_ALIGNED(width, 8)) { + I420ToBGRARow = I420ToBGRARow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) { + I420ToBGRARow = I420ToBGRARow_SSSE3; + } } } #endif @@ -763,11 +767,13 @@ int I420ToABGR(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_I420TOABGRROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I420ToABGRRow = I420ToABGRRow_Any_SSSE3; - if (IS_ALIGNED(width, 8) && - IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { - I420ToABGRRow = I420ToABGRRow_SSSE3; + if (IS_ALIGNED(width, 8)) { + I420ToABGRRow = I420ToABGRRow_Unaligned_SSSE3; + if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) { + I420ToABGRRow = I420ToABGRRow_SSSE3; + } } } #endif @@ -816,7 +822,9 @@ int I420ToRGB24(const uint8* src_y, int src_stride_y, ARGBToRGB24Row_C; #if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + } if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -869,7 +877,9 @@ int I420ToRAW(const uint8* src_y, int src_stride_y, ARGBToRAWRow_C; #if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + } if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -922,7 +932,9 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, ARGBToRGB565Row_C; #if defined(HAS_ARGBTORGB565ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + if (width * 2 <= kMaxStride) { + ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2; + } if (IS_ALIGNED(width, 4)) { ARGBToRGB565Row = ARGBToRGB565Row_SSE2; } @@ -974,7 +986,9 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, ARGBToARGB1555Row_C; #if defined(HAS_ARGBTOARGB1555ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + if (width * 2 <= kMaxStride) { + ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2; + } if (IS_ALIGNED(width, 4)) { ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2; } @@ -1026,7 +1040,9 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, ARGBToARGB4444Row_C; #if defined(HAS_ARGBTOARGB4444ROW_SSE2) if (TestCpuFlag(kCpuHasSSE2)) { - ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + if (width * 2 <= kMaxStride) { + ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2; + } if (IS_ALIGNED(width, 4)) { ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2; } diff --git a/source/planar_functions.cc b/source/planar_functions.cc index f5b834a8b..67cb7a46e 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -216,7 +216,7 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_I420TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I420ToARGBRow = I420ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { @@ -478,7 +478,9 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb, #if defined(HAS_ARGBTORGB24ROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + if (width * 3 <= kMaxStride) { + ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3; + } if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) { ARGBToRGB24Row = ARGBToRGB24Row_SSSE3; @@ -508,7 +510,9 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb, #if defined(HAS_ARGBTORAWROW_SSSE3) if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) { - ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + if (width * 3 <= kMaxStride) { + ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3; + } if (IS_ALIGNED(width, 16) && IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) { ARGBToRAWRow = ARGBToRAWRow_SSSE3; @@ -548,7 +552,7 @@ int NV12ToARGB(const uint8* src_y, int src_stride_y, } } #elif defined(HAS_I420TOARGBROW_SSSE3) - if (TestCpuFlag(kCpuHasSSSE3)) { + if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) { I420ToARGBRow = I420ToARGBRow_Any_SSSE3; if (IS_ALIGNED(width, 8) && IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { diff --git a/source/row.h b/source/row.h index 32eefc308..2b53b1c61 100644 --- a/source/row.h +++ b/source/row.h @@ -18,7 +18,7 @@ namespace libyuv { extern "C" { #endif -#define kMaxStride (2048 * 4) +#define kMaxStride (2560 * 4) #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1))) #if defined(COVERAGE_ENABLED) || defined(TARGET_IPHONE_SIMULATOR) @@ -209,14 +209,14 @@ void I420ToABGRRow_C(const uint8* y_buf, int width); void I444ToARGBRow_C(const uint8* y_buf, - const uint8* u_buf, - const uint8* v_buf, - uint8* rgb_buf, - int width); + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void YToARGBRow_C(const uint8* y_buf, - uint8* rgb_buf, - int width); + uint8* rgb_buf, + int width); void I420ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, @@ -236,6 +236,24 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, uint8* rgb_buf, int width); +void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + +void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); + void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/source/row_common.cc b/source/row_common.cc index 68b6682cf..b57ffb4d0 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -519,29 +519,33 @@ void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) { } // Wrappers to handle odd sizes/alignments -#define MAKEYUVANY(NAMEANY, NAME, COPYROW) \ +#define YUVANY(NAMEANY, I420TORGB_SSE, I420TORGB_C) \ void NAMEANY(const uint8* y_buf, \ const uint8* u_buf, \ const uint8* v_buf, \ uint8* rgb_buf, \ int width) { \ - SIMD_ALIGNED(uint8 row[kMaxStride]); \ - NAME(y_buf, u_buf, v_buf, row, width); \ - COPYROW(row, rgb_buf, width << 2); \ + int n = width & ~7; \ + I420TORGB_SSE(y_buf, u_buf, v_buf, rgb_buf, n); \ + I420TORGB_C(y_buf + n, \ + u_buf + (n >> 1), \ + v_buf + (n >> 1), \ + rgb_buf + n * 4, width & 7); \ } #if defined(HAS_I420TOARGBROW_SSSE3) -MAKEYUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_SSSE3, CopyRow_X86) -MAKEYUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_SSSE3, CopyRow_X86) -MAKEYUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_SSSE3, CopyRow_X86) +YUVANY(I420ToARGBRow_Any_SSSE3, I420ToARGBRow_Unaligned_SSSE3, I420ToARGBRow_C) +YUVANY(I420ToBGRARow_Any_SSSE3, I420ToBGRARow_Unaligned_SSSE3, I420ToBGRARow_C) +YUVANY(I420ToABGRRow_Any_SSSE3, I420ToABGRRow_Unaligned_SSSE3, I420ToABGRRow_C) #endif #if defined(HAS_I420TOARGBROW_NEON) -MAKEYUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, CopyRow_C) -MAKEYUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, CopyRow_C) -MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C) +YUVANY(I420ToARGBRow_Any_NEON, I420ToARGBRow_NEON, I420ToARGBRow_C) +YUVANY(I420ToBGRARow_Any_NEON, I420ToBGRARow_NEON, I420ToBGRARow_C) +YUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, I420ToABGRRow_C) #endif +#undef YUVANY -#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \ +#define RGBANY(NAMEANY, ARGBTORGB, BPP) \ void NAMEANY(const uint8* argb_buf, \ uint8* rgb_buf, \ int width) { \ @@ -551,41 +555,45 @@ MAKEYUVANY(I420ToABGRRow_Any_NEON, I420ToABGRRow_NEON, CopyRow_C) } #if defined(HAS_ARGBTORGB24ROW_SSSE3) -MAKEYUVANYRGB(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) -MAKEYUVANYRGB(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3) -MAKEYUVANYRGB(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) -MAKEYUVANYRGB(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) -MAKEYUVANYRGB(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) +RGBANY(ARGBToRGB24Row_Any_SSSE3, ARGBToRGB24Row_SSSE3, 3) +RGBANY(ARGBToRAWRow_Any_SSSE3, ARGBToRAWRow_SSSE3, 3) +RGBANY(ARGBToRGB565Row_Any_SSE2, ARGBToRGB565Row_SSE2, 2) +RGBANY(ARGBToARGB1555Row_Any_SSE2, ARGBToARGB1555Row_SSE2, 2) +RGBANY(ARGBToARGB4444Row_Any_SSE2, ARGBToARGB4444Row_SSE2, 2) #endif +#undef RGBANY #ifdef HAS_ARGBTOYROW_SSSE3 - -#define MAKEYANY(NAMEANY, ARGBTOY_SSE, BPP) \ +#define YANY(NAMEANY, ARGBTOY_SSE, BPP) \ void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \ ARGBTOY_SSE(src_argb, dst_y, width - 16); \ ARGBTOY_SSE(src_argb + (width - 16) * BPP, dst_y + (width - 16), 16); \ } -MAKEYANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4) -MAKEYANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4) -MAKEYANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4) -MAKEYANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2) -MAKEYANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2) +YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4) +YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4) +YANY(ABGRToYRow_Any_SSSE3, ABGRToYRow_Unaligned_SSSE3, 4) +YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2) +YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2) +#undef YANY -#define MAKEUVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \ - void NAMEANY(const uint8* src_argb0, int src_stride_argb, \ +#define UVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \ + void NAMEANY(const uint8* src_argb, int src_stride_argb, \ uint8* dst_u, uint8* dst_v, int width) { \ - ARGBTOUV_SSE(src_argb0, src_stride_argb, dst_u, dst_v, width & ~15); \ - ARGBTOUV_C(src_argb0 + (width & ~15) * BPP, src_stride_argb, \ - dst_u + (width & ~15) / 2, dst_v + (width & ~15) / 2, \ + int n = width & ~15; \ + ARGBTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \ + ARGBTOUV_C(src_argb + n * BPP, src_stride_argb, \ + dst_u + (n >> 1), \ + dst_v + (n >> 1), \ width & 15); \ } -MAKEUVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4) -MAKEUVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4) -MAKEUVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4) -MAKEUVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2) -MAKEUVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) +UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4) +UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4) +UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4) +UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2) +UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2) +#undef UVANY #endif #ifdef __cplusplus diff --git a/source/row_posix.cc b/source/row_posix.cc index 709913ab0..f839e204e 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1389,6 +1389,109 @@ void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf, ); } +void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + "1: \n" + YUVTORGB + "punpcklbw %%xmm1,%%xmm0 \n" + "punpcklbw %%xmm5,%%xmm2 \n" + "movdqa %%xmm0,%%xmm1 \n" + "punpcklwd %%xmm2,%%xmm0 \n" + "punpckhwd %%xmm2,%%xmm1 \n" + "movdqu %%xmm0,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + "1: \n" + YUVTORGB + "pcmpeqb %%xmm5,%%xmm5 \n" + "punpcklbw %%xmm0,%%xmm1 \n" + "punpcklbw %%xmm2,%%xmm5 \n" + "movdqa %%xmm5,%%xmm0 \n" + "punpcklwd %%xmm1,%%xmm5 \n" + "punpckhwd %%xmm1,%%xmm0 \n" + "movdqu %%xmm5,(%3) \n" + "movdqu %%xmm0,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + +void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + asm volatile ( + "sub %1,%2 \n" + "pcmpeqb %%xmm5,%%xmm5 \n" + "pxor %%xmm4,%%xmm4 \n" + "1: \n" + YUVTORGB + "punpcklbw %%xmm1,%%xmm2 \n" + "punpcklbw %%xmm5,%%xmm0 \n" + "movdqa %%xmm2,%%xmm1 \n" + "punpcklwd %%xmm0,%%xmm2 \n" + "punpckhwd %%xmm0,%%xmm1 \n" + "movdqu %%xmm2,(%3) \n" + "movdqu %%xmm1,0x10(%3) \n" + "lea 0x20(%3),%3 \n" + "sub $0x8,%4 \n" + "jg 1b \n" + : "+r"(y_buf), // %0 + "+r"(u_buf), // %1 + "+r"(v_buf), // %2 + "+r"(rgb_buf), // %3 + "+rm"(width) // %4 + : "r"(&kYuvConstants.kUVToB) // %5 + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" +#endif + ); +} + void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, diff --git a/source/row_win.cc b/source/row_win.cc index 06f361e73..c98cd1ab8 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1388,6 +1388,126 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf, } } +__declspec(naked) +void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm0, xmm1 // BG + punpcklbw xmm2, xmm5 // RA + movdqa xmm1, xmm0 + punpcklwd xmm0, xmm2 // BGRA first 4 pixels + punpckhwd xmm1, xmm2 // BGRA next 4 pixels + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pxor xmm4, xmm4 + + align 16 + convertloop: + YUVTORGB + + // Step 3: Weave into BGRA + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + punpcklbw xmm1, xmm0 // GB + punpcklbw xmm5, xmm2 // AR + movdqa xmm0, xmm5 + punpcklwd xmm5, xmm1 // BGRA first 4 pixels + punpckhwd xmm0, xmm1 // BGRA next 4 pixels + movdqu [edx], xmm5 + movdqu [edx + 16], xmm0 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + +__declspec(naked) +void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm { + push esi + push edi + mov eax, [esp + 8 + 4] // Y + mov esi, [esp + 8 + 8] // U + mov edi, [esp + 8 + 12] // V + mov edx, [esp + 8 + 16] // rgb + mov ecx, [esp + 8 + 20] // width + sub edi, esi + pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha + pxor xmm4, xmm4 + + align 16 + convertloop: + YUVTORGB + + // Step 3: Weave into ARGB + punpcklbw xmm2, xmm1 // RG + punpcklbw xmm0, xmm5 // BA + movdqa xmm1, xmm2 + punpcklwd xmm2, xmm0 // RGBA first 4 pixels + punpckhwd xmm1, xmm0 // RGBA next 4 pixels + movdqu [edx], xmm2 + movdqu [edx + 16], xmm1 + lea edx, [edx + 32] + sub ecx, 8 + jg convertloop + + pop edi + pop esi + ret + } +} + __declspec(naked) void I444ToARGBRow_SSSE3(const uint8* y_buf, const uint8* u_buf,