diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index e070aaa09..8cc3e4981 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -618,21 +618,21 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
     src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
     src_stride_yuy2 = -src_stride_yuy2;
   }
-  void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
-                      uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+  void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                         int pix) = YUY2ToUV422Row_C;
   void (*YUY2ToYRow)(const uint8* src_yuy2,
                      uint8* dst_y, int pix) = YUY2ToYRow_C;
 #if defined(HAS_YUY2TOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     if (width > 16) {
-      YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+      YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
       YUY2ToYRow = YUY2ToYRow_Any_SSE2;
     }
     if (IS_ALIGNED(width, 16)) {
-      YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+      YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
       YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
       if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
-        YUY2ToUVRow = YUY2ToUVRow_SSE2;
+        YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
         YUY2ToYRow = YUY2ToYRow_SSE2;
       }
     }
@@ -665,7 +665,7 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
   SIMD_ALIGNED(uint8 rowv[kMaxStride]);

   for (int y = 0; y < height; ++y) {
-    YUY2ToUVRow(src_yuy2, 0, rowu, rowv, width);
+    YUY2ToUV422Row(src_yuy2, rowu, rowv, width);
     YUY2ToYRow(src_yuy2, rowy, width);
     I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
     src_yuy2 += src_stride_yuy2;
@@ -688,21 +688,21 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
     src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
     src_stride_uyvy = -src_stride_uyvy;
   }
-  void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
-                      uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+  void (*UYVYToUV422Row)(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                         int pix) = UYVYToUV422Row_C;
   void (*UYVYToYRow)(const uint8* src_uyvy,
                      uint8* dst_y, int pix) = UYVYToYRow_C;
 #if defined(HAS_UYVYTOYROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
     if (width > 16) {
-      UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+      UYVYToUV422Row = UYVYToUV422Row_Any_SSE2;
       UYVYToYRow = UYVYToYRow_Any_SSE2;
     }
     if (IS_ALIGNED(width, 16)) {
-      UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+      UYVYToUV422Row = UYVYToUV422Row_Unaligned_SSE2;
       UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
       if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
-        UYVYToUVRow = UYVYToUVRow_SSE2;
+        UYVYToUV422Row = UYVYToUV422Row_SSE2;
         UYVYToYRow = UYVYToYRow_SSE2;
       }
     }
@@ -733,8 +733,9 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
   SIMD_ALIGNED(uint8 rowy[kMaxStride]);
   SIMD_ALIGNED(uint8 rowu[kMaxStride]);
   SIMD_ALIGNED(uint8 rowv[kMaxStride]);
+
   for (int y = 0; y < height; ++y) {
-    UYVYToUVRow(src_uyvy, 0, rowu, rowv, width);
+    UYVYToUV422Row(src_uyvy, rowu, rowv, width);
     UYVYToYRow(src_uyvy, rowy, width);
     I422ToARGBRow(rowy, rowu, rowv, dst_argb, width);
     src_uyvy += src_stride_uyvy;
diff --git a/source/row.h b/source/row.h
index 6973c45ee..f52ed69c4 100644
--- a/source/row.h
+++ b/source/row.h
@@ -70,7 +70,6 @@ extern "C" {
 #define HAS_UYVYTOUVROW_SSE2
 #define HAS_UYVYTOYROW_SSE2
 #define HAS_YTOARGBROW_SSE2
-#define HAS_YUY2TOUVROW_SSE2
 #define HAS_YUY2TOYROW_SSE2
 #define HAS_ARGBGRAYROW_SSSE3
 #define HAS_ARGBSEPIAROW_SSSE3
@@ -455,33 +454,48 @@ void I422ToABGRRow_Any_NEON(const uint8* y_buf,
 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_y, int pix);
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                                uint8* dst_u, uint8* dst_y, int pix);
+                                uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
+                                   uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
+void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int stride_yuy2,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void YUY2ToUV422Row_Any_SSE2(const uint8* src_yuy2,
+                             uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                      uint8* dst_u, uint8* dst_y, int pix);
+                      uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_y, int pix);
-
-void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
-                   uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int pix);
-void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
-                   uint8* dst_u, uint8* dst_v, int pix);
+                                uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+                                   uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int pix);
-
-void YUY2ToUVRow_Any_SSE2(const uint8* src_yuy2, int src_stride_yuy2,
-                          uint8* dst_u, uint8* dst_v, int pix);
-void YUY2ToYRow_Any_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
-void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int src_stride_uyvy,
-                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
+                   uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int pix);
 void UYVYToYRow_Any_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix);
+void UYVYToUVRow_Any_SSE2(const uint8* src_uyvy, int stride_uyvy,
+                          uint8* dst_u, uint8* dst_v, int pix);
+void UYVYToUV422Row_Any_SSE2(const uint8* src_uyvy,
+                             uint8* dst_u, uint8* dst_v, int pix);
+
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
 void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
diff --git a/source/row_common.cc b/source/row_common.cc
index 838175b00..cd37c4584 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -621,10 +621,10 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {
   memcpy(dst, src, count);
 }

-// Filter 2 rows of YUY2 UV's (422) into U and V (420)
+// Filter 2 rows of YUY2 UV's (422) into U and V (420).
 void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int width) {
-  // Output a row of UV values, filtering 2 rows of YUY2
+  // Output a row of UV values, filtering 2 rows of YUY2.
   for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
@@ -634,8 +634,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
   }
 }

+// Copy row of YUY2 UV's (422) into U and V (422).
+void YUY2ToUV422Row_C(const uint8* src_yuy2,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values.
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = src_yuy2[1];
+    dst_v[0] = src_yuy2[3];
+    src_yuy2 += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of YUY2 Y's (422) into Y (420/422).
 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
-  // Copy a row of yuy2 Y values
+  // Output a row of Y values.
   for (int x = 0; x < width - 1; x += 2) {
     dst_y[x] = src_yuy2[0];
     dst_y[x + 1] = src_yuy2[2];
@@ -646,9 +660,10 @@ void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
   }
 }

+// Filter 2 rows of UYVY UV's (422) into U and V (420).
 void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
                    uint8* dst_u, uint8* dst_v, int width) {
-  // Copy a row of uyvy UV values
+  // Output a row of UV values.
   for (int x = 0; x < width; x += 2) {
     dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
     dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
@@ -658,15 +673,29 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
   }
 }

-void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
-  // Copy a row of uyvy Y values
+// Copy row of UYVY UV's (422) into U and V (422).
+void UYVYToUV422Row_C(const uint8* src_uyvy,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  // Output a row of UV values.
+  for (int x = 0; x < width; x += 2) {
+    dst_u[0] = src_uyvy[0];
+    dst_v[0] = src_uyvy[2];
+    src_uyvy += 4;
+    dst_u += 1;
+    dst_v += 1;
+  }
+}
+
+// Copy row of UYVY Y's (422) into Y (420/422).
+void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width) {
+  // Output a row of Y values.
   for (int x = 0; x < width - 1; x += 2) {
-    dst_y[x] = src_yuy2[1];
-    dst_y[x + 1] = src_yuy2[3];
-    src_yuy2 += 4;
+    dst_y[x] = src_uyvy[1];
+    dst_y[x + 1] = src_uyvy[3];
+    src_uyvy += 4;
   }
   if (width & 1) {
-    dst_y[width - 1] = src_yuy2[1];
+    dst_y[width - 1] = src_uyvy[1];
   }
 }

@@ -910,12 +939,12 @@ YANY(YUY2ToYRow_Any_SSE2, YUY2ToYRow_Unaligned_SSE2, 2)
 YANY(UYVYToYRow_Any_SSE2, UYVYToYRow_Unaligned_SSE2, 2)
 #undef YANY

-#define UVANY(NAMEANY, ARGBTOUV_SSE, ARGBTOUV_C, BPP) \
+#define UVANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
     void NAMEANY(const uint8* src_argb, int src_stride_argb, \
                  uint8* dst_u, uint8* dst_v, int width) { \
       int n = width & ~15; \
-      ARGBTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
-      ARGBTOUV_C(src_argb + n * BPP, src_stride_argb, \
+      ANYTOUV_SSE(src_argb, src_stride_argb, dst_u, dst_v, n); \
+      ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
                  dst_u + (n >> 1), \
                  dst_v + (n >> 1), \
                  width & 15); \
@@ -927,6 +956,24 @@ UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
 UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
 UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
 #undef UVANY
+
+#define UV422ANY(NAMEANY, ANYTOUV_SSE, ANYTOUV_C, BPP) \
+    void NAMEANY(const uint8* src_argb, \
+                 uint8* dst_u, uint8* dst_v, int width) { \
+      int n = width & ~15; \
+      ANYTOUV_SSE(src_argb, dst_u, dst_v, n); \
+      ANYTOUV_C(src_argb + n * BPP, \
+                dst_u + (n >> 1), \
+                dst_v + (n >> 1), \
+                width & 15); \
+    }
+
+UV422ANY(YUY2ToUV422Row_Any_SSE2, YUY2ToUV422Row_Unaligned_SSE2, \
+         YUY2ToUV422Row_C, 2)
+UV422ANY(UYVYToUV422Row_Any_SSE2, UYVYToUV422Row_Unaligned_SSE2, \
+         UYVYToUV422Row_C, 2)
+#undef UV422ANY
+
 #endif

 void ComputeCumulativeSumRow_C(const uint8* row, int32* cumsum,
diff --git a/source/row_posix.cc b/source/row_posix.cc
index ddfd27855..305934663 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -2114,7 +2114,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
 }

 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
-                      uint8* dst_u, uint8* dst_y, int pix) {
+                      uint8* dst_u, uint8* dst_v, int pix) {
   asm volatile (
     "pcmpeqb %%xmm5,%%xmm5 \n"
     "psrlw $0x8,%%xmm5 \n"
@@ -2143,7 +2143,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
     "jg 1b \n"
   : "+r"(src_yuy2), // %0
     "+r"(dst_u), // %1
-    "+r"(dst_y), // %2
+    "+r"(dst_v), // %2
     "+r"(pix) // %3
   : "r"(static_cast<intptr_t>(stride_yuy2)) // %4
   : "memory", "cc"
@@ -2153,6 +2153,41 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   );
 }

+void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  asm volatile (
+    "pcmpeqb %%xmm5,%%xmm5 \n"
+    "psrlw $0x8,%%xmm5 \n"
+    "sub %1,%2 \n"
+    ".p2align 4 \n"
+  "1: \n"
+    "movdqa (%0),%%xmm0 \n"
+    "movdqa 0x10(%0),%%xmm1 \n"
+    "lea 0x20(%0),%0 \n"
+    "psrlw $0x8,%%xmm0 \n"
+    "psrlw $0x8,%%xmm1 \n"
+    "packuswb %%xmm1,%%xmm0 \n"
+    "movdqa %%xmm0,%%xmm1 \n"
+    "pand %%xmm5,%%xmm0 \n"
+    "packuswb %%xmm0,%%xmm0 \n"
+    "psrlw $0x8,%%xmm1 \n"
+    "packuswb %%xmm1,%%xmm1 \n"
+    "movq %%xmm0,(%1) \n"
+    "movq %%xmm1,(%1,%2) \n"
+    "lea 0x8(%1),%1 \n"
+    "sub $0x10,%3 \n"
+    "jg 1b \n"
+  : "+r"(src_yuy2), // %0
+    "+r"(dst_u), // %1
+    "+r"(dst_v), // %2
+    "+r"(pix) // %3
+  :
+  : "memory", "cc"
+#if defined(__SSE2__)
+    , "xmm0", "xmm1", "xmm5"
+#endif
+  );
+}

 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
@@ -2214,7 +2249,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
     "jg 1b \n"
   : "+r"(src_yuy2), // %0
"+r"(dst_u), // %1 - "+r"(dst_y), // %2 + "+r"(dst_v), // %2 "+r"(pix) // %3 : "r"(static_cast(stride_yuy2)) // %4 : "memory", "cc" @@ -2224,6 +2259,42 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, ); } +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + "movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "psrlw $0x8,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_yuy2), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( ".p2align 4 \n" @@ -2250,7 +2321,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { } void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2279,7 +2350,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 - "+r"(dst_y), // %2 + "+r"(dst_v), // %2 "+r"(pix) // %3 : "r"(static_cast(stride_uyvy)) // %4 : "memory", "cc" @@ -2289,6 +2360,42 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, ); } +void UYVYToUV422Row_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqa (%0),%%xmm0 \n" + "movdqa 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} + void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( @@ -2316,7 +2423,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, } void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { + uint8* dst_u, uint8* dst_v, int pix) { asm volatile ( "pcmpeqb %%xmm5,%%xmm5 \n" "psrlw $0x8,%%xmm5 \n" @@ -2345,7 +2452,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, "jg 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 - "+r"(dst_y), // %2 + "+r"(dst_v), // %2 "+r"(pix) // %3 : "r"(static_cast(stride_uyvy)) // %4 : "memory", "cc" @@ -2354,6 +2461,42 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, #endif ); } + +void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, + uint8* dst_u, uint8* dst_v, int pix) { + asm volatile ( + "pcmpeqb %%xmm5,%%xmm5 \n" + "psrlw $0x8,%%xmm5 \n" + "sub %1,%2 \n" + ".p2align 4 \n" + "1: \n" + "movdqu (%0),%%xmm0 \n" + 
"movdqu 0x10(%0),%%xmm1 \n" + "lea 0x20(%0),%0 \n" + "pand %%xmm5,%%xmm0 \n" + "pand %%xmm5,%%xmm1 \n" + "packuswb %%xmm1,%%xmm0 \n" + "movdqa %%xmm0,%%xmm1 \n" + "pand %%xmm5,%%xmm0 \n" + "packuswb %%xmm0,%%xmm0 \n" + "psrlw $0x8,%%xmm1 \n" + "packuswb %%xmm1,%%xmm1 \n" + "movq %%xmm0,(%1) \n" + "movq %%xmm1,(%1,%2) \n" + "lea 0x8(%1),%1 \n" + "sub $0x10,%3 \n" + "jg 1b \n" + : "+r"(src_uyvy), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "memory", "cc" +#if defined(__SSE2__) + , "xmm0", "xmm1", "xmm5" +#endif + ); +} #endif // HAS_YUY2TOYROW_SSE2 #ifdef HAS_ARGBBLENDROW_SSE2 diff --git a/source/row_win.cc b/source/row_win.cc index b69f9a23c..47a6749d1 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2198,7 +2198,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2, __declspec(naked) __declspec(align(16)) void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { + uint8* dst_u, uint8* dst_v, int pix) { __asm { push esi push edi @@ -2240,6 +2240,43 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, } } +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqa xmm0, [eax] + movdqa xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + __declspec(naked) __declspec(align(16)) void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { @@ -2268,7 +2305,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, __declspec(naked) __declspec(align(16)) void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_y, int pix) { + uint8* dst_u, uint8* dst_v, int pix) { __asm { push esi push edi @@ -2310,6 +2347,43 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, } } +__declspec(naked) __declspec(align(16)) +void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, + uint8* dst_u, uint8* dst_v, int pix) { + __asm { + push edi + mov eax, [esp + 4 + 4] // src_yuy2 + mov edx, [esp + 4 + 8] // dst_u + mov edi, [esp + 4 + 12] // dst_v + mov ecx, [esp + 4 + 16] // pix + pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff + psrlw xmm5, 8 + sub edi, edx + + align 16 + convertloop: + movdqu xmm0, [eax] + movdqu xmm1, [eax + 16] + lea eax, [eax + 32] + psrlw xmm0, 8 // YUYV -> UVUV + psrlw xmm1, 8 + packuswb xmm0, xmm1 + movdqa xmm1, xmm0 + pand xmm0, xmm5 // U + packuswb xmm0, xmm0 + psrlw xmm1, 8 // V + packuswb xmm1, xmm1 + movq qword ptr [edx], xmm0 + movq qword ptr [edx + edi], xmm1 + lea edx, [edx + 8] + sub ecx, 16 + jg convertloop + + pop edi + ret + } +} + __declspec(naked) __declspec(align(16)) void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) { @@ -2336,7 +2410,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy, __declspec(naked) __declspec(align(16)) void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_y, int pix) { + uint8* dst_u, uint8* 
-                      uint8* dst_u, uint8* dst_y, int pix) {
+                      uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push esi
     push edi
@@ -2378,6 +2452,43 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }

+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
+                         uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_yuy2
+    mov edx, [esp + 4 + 8] // dst_u
+    mov edi, [esp + 4 + 12] // dst_v
+    mov ecx, [esp + 4 + 16] // pix
+    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+    psrlw xmm5, 8
+    sub edi, edx
+
+    align 16
+  convertloop:
+    movdqa xmm0, [eax]
+    movdqa xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    pand xmm0, xmm5 // UYVY -> UVUV
+    pand xmm1, xmm5
+    packuswb xmm0, xmm1
+    movdqa xmm1, xmm0
+    pand xmm0, xmm5 // U
+    packuswb xmm0, xmm0
+    psrlw xmm1, 8 // V
+    packuswb xmm1, xmm1
+    movq qword ptr [edx], xmm0
+    movq qword ptr [edx + edi], xmm1
+    lea edx, [edx + 8]
+    sub ecx, 16
+    jg convertloop
+
+    pop edi
+    ret
+  }
+}
+
 __declspec(naked) __declspec(align(16))
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
@@ -2404,7 +2515,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,

 __declspec(naked) __declspec(align(16))
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-                                uint8* dst_u, uint8* dst_y, int pix) {
+                                uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push esi
     push edi
@@ -2445,6 +2556,43 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
     ret
   }
 }
+
+__declspec(naked) __declspec(align(16))
+void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
+                                   uint8* dst_u, uint8* dst_v, int pix) {
+  __asm {
+    push edi
+    mov eax, [esp + 4 + 4] // src_yuy2
+    mov edx, [esp + 4 + 8] // dst_u
+    mov edi, [esp + 4 + 12] // dst_v
+    mov ecx, [esp + 4 + 16] // pix
+    pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
+    psrlw xmm5, 8
+    sub edi, edx
+
+    align 16
+  convertloop:
+    movdqu xmm0, [eax]
+    movdqu xmm1, [eax + 16]
+    lea eax, [eax + 32]
+    pand xmm0, xmm5 // UYVY -> UVUV
+    pand xmm1, xmm5
+    packuswb xmm0, xmm1
+    movdqa xmm1, xmm0
+    pand xmm0, xmm5 // U
+    packuswb xmm0, xmm0
+    psrlw xmm1, 8 // V
+    packuswb xmm1, xmm1
+    movq qword ptr [edx], xmm0
+    movq qword ptr [edx + edi], xmm1
+    lea edx, [edx + 8]
+    sub ecx, 16
+    jg convertloop
+
+    pop edi
+    ret
+  }
+}
 #endif  // HAS_YUY2TOYROW_SSE2

 #ifdef HAS_ARGBBLENDROW_SSE2
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 7adb624ee..ace06ca3b 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -31,11 +31,12 @@ namespace libyuv {
 TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
   const int kWidth = 1280; \
   const int kHeight = 720; \
+  const int kStride = (kWidth * 8 * BPP_B + 7) / 8; \
   align_buffer_16(src_y, kWidth * kHeight); \
   align_buffer_16(src_u, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   align_buffer_16(src_v, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
-  align_buffer_16(dst_argb_c, (kWidth * BPP_B) * kHeight); \
-  align_buffer_16(dst_argb_opt, (kWidth * BPP_B) * kHeight); \
+  align_buffer_16(dst_argb_c, kStride * kHeight); \
+  align_buffer_16(dst_argb_opt, kStride * kHeight); \
   srandom(time(NULL)); \
   for (int i = 0; i < kHeight; ++i) \
     for (int j = 0; j < kWidth; ++j) \
@@ -49,16 +50,16 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
   FMT_PLANAR##To##FMT_B(src_y, kWidth, \
                         src_u, kWidth / SUBSAMP_X, \
                         src_v, kWidth / SUBSAMP_X, \
-                        dst_argb_c, kWidth * BPP_B, \
-                        kWidth, NEG##kHeight); \
+                        dst_argb_c, kStride, \
+                        kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
     FMT_PLANAR##To##FMT_B(src_y, kWidth, \
                           src_u, kWidth / SUBSAMP_X, \
                           src_v, kWidth / SUBSAMP_X, \
-                          dst_argb_opt, kWidth * BPP_B, \
-                          kWidth, NEG##kHeight); \
+                          dst_argb_opt, kStride, \
+                          kWidth, NEG kHeight); \
   } \
   int max_diff = 0; \
   for (int i = 0; i < kHeight; ++i) { \
@@ -96,7 +97,7 @@ TESTPLANARTOB(I422, 2, 1, ARGB, 4)
 TESTPLANARTOB(I444, 1, 1, ARGB, 4)
 TESTPLANARTOB(I420, 2, 2, YUY2, 2)
 TESTPLANARTOB(I420, 2, 2, UYVY, 2)
-TESTPLANARTOB(I420, 2, 2, V210, 45 / 16)
+TESTPLANARTOB(I420, 2, 2, V210, 16 / 6)
 TESTPLANARTOB(I420, 2, 2, I400, 1)
 TESTPLANARTOB(I420, 2, 2, BayerBGGR, 1)
 TESTPLANARTOB(I420, 2, 2, BayerRGGB, 1)
@@ -124,14 +125,14 @@ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N##_OptVsC) { \
   FMT_PLANAR##To##FMT_B(src_y, kWidth, \
                         src_uv, kWidth / SUBSAMP_X * 2, \
                         dst_argb_c, kWidth * BPP_B, \
-                        kWidth, NEG##kHeight); \
+                        kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
     FMT_PLANAR##To##FMT_B(src_y, kWidth, \
                           src_uv, kWidth / SUBSAMP_X * 2, \
                           dst_argb_opt, kWidth * BPP_B, \
-                          kWidth, NEG##kHeight); \
+                          kWidth, NEG kHeight); \
   } \
   int max_diff = 0; \
   for (int i = 0; i < kHeight; ++i) { \
@@ -164,7 +165,8 @@ TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2)
 TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
   const int kWidth = 1280; \
   const int kHeight = 720; \
-  align_buffer_16(src_argb, (kWidth * BPP_A) * kHeight); \
+  const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \
+  align_buffer_16(src_argb, kStride * kHeight); \
   align_buffer_16(dst_y_c, kWidth * kHeight); \
   align_buffer_16(dst_u_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   align_buffer_16(dst_v_c, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
@@ -173,22 +175,22 @@ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N##_OptVsC) { \
   align_buffer_16(dst_v_opt, kWidth / SUBSAMP_X * kHeight / SUBSAMP_Y); \
   srandom(time(NULL)); \
   for (int i = 0; i < kHeight; ++i) \
-    for (int j = 0; j < kWidth * BPP_A; ++j) \
-      src_argb[(i * kWidth * BPP_A) + j] = (random() & 0xff); \
+    for (int j = 0; j < kStride; ++j) \
+      src_argb[(i * kStride) + j] = (random() & 0xff); \
   MaskCpuFlags(kCpuInitialized); \
-  FMT_A##To##FMT_PLANAR(src_argb, kWidth * BPP_A, \
+  FMT_A##To##FMT_PLANAR(src_argb, kStride, \
                         dst_y_c, kWidth, \
                         dst_u_c, kWidth / SUBSAMP_X, \
                         dst_v_c, kWidth / SUBSAMP_X, \
-                        kWidth, NEG##kHeight); \
+                        kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
-    FMT_A##To##FMT_PLANAR(src_argb, kWidth * BPP_A, \
+    FMT_A##To##FMT_PLANAR(src_argb, kStride, \
                           dst_y_opt, kWidth, \
                           dst_u_opt, kWidth / SUBSAMP_X, \
                           dst_v_opt, kWidth / SUBSAMP_X, \
-                          kWidth, NEG##kHeight); \
+                          kWidth, NEG kHeight); \
   } \
   int max_diff = 0; \
   for (int i = 0; i < kHeight; ++i) { \
@@ -251,7 +253,7 @@ TESTATOPLANAR(ARGB, 4, I422, 2, 1)
 // TODO(fbarchard): Implement and test 411 and 444
 TESTATOPLANAR(YUY2, 2, I420, 2, 2)
 TESTATOPLANAR(UYVY, 2, I420, 2, 2)
-TESTATOPLANAR(V210, 45 / 16, I420, 2, 2)
+TESTATOPLANAR(V210, 16 / 6, I420, 2, 2)
 TESTATOPLANAR(I400, 1, I420, 2, 2)
 TESTATOPLANAR(BayerBGGR, 1, I420, 2, 2)
 TESTATOPLANAR(BayerRGGB, 1, I420, 2, 2)
@@ -272,13 +274,13 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N##_OptVsC) { \
   MaskCpuFlags(kCpuInitialized); \
   FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
                    dst_argb_c, kWidth * BPP_B, \
-                   kWidth, NEG##kHeight); \
+                   kWidth, NEG kHeight); \
   MaskCpuFlags(-1); \
   const int runs = 1000; \
   for (int i = 0; i < runs; ++i) { \
     FMT_A##To##FMT_B(src_argb, kWidth * STRIDE_A, \
                      dst_argb_opt, kWidth * BPP_B, \
-                     kWidth, NEG##kHeight); \
+                     kWidth, NEG kHeight); \
   } \
   int max_diff = 0; \
   for (int i = 0; i < kHeight * kWidth * BPP_B; ++i) { \