diff --git a/README.chromium b/README.chromium index d9260d74a..3eca43879 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 277 +Version: 279 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index c880d8fec..dd2df8d05 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,7 +11,7 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 277 +#define LIBYUV_VERSION 279 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/row_posix.cc b/source/row_posix.cc index 479ece0ac..28f10c040 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -1212,7 +1212,6 @@ void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr, #endif ); } - #endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 @@ -1251,73 +1250,32 @@ struct { { YG, YG, YG, YG, YG, YG, YG, YG } }; -// Convert 8 pixels: 8 UV and 8 Y -#define YUV444TORGB \ +// Read 8 UV from 411 +#define READYUV444 \ "movq (%1),%%xmm0 \n" \ "movq (%1,%2,1),%%xmm1 \n" \ "lea 0x8(%1),%1 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%5),%%xmm0 \n" \ - "pmaddubsw 16(%5),%%xmm1 \n" \ - "pmaddubsw 32(%5),%%xmm2 \n" \ - "psubw 48(%5),%%xmm0 \n" \ - "psubw 64(%5),%%xmm1 \n" \ - "psubw 80(%5),%%xmm2 \n" \ - "movq (%0),%%xmm3 \n" \ - "lea 0x8(%0),%0 \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%5),%%xmm3 \n" \ - "pmullw 112(%5),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" -// Convert 8 pixels: 4 UV and 8 Y -#define YUV422TORGB \ +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 \ "movd (%1),%%xmm0 \n" \ "movd (%1,%2,1),%%xmm1 \n" \ "lea 0x4(%1),%1 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ - "movdqa %%xmm0,%%xmm1 \n" \ - "movdqa %%xmm0,%%xmm2 \n" \ - "pmaddubsw (%5),%%xmm0 \n" \ - "pmaddubsw 16(%5),%%xmm1 \n" \ - "pmaddubsw 32(%5),%%xmm2 \n" \ - "psubw 48(%5),%%xmm0 \n" \ - "psubw 64(%5),%%xmm1 \n" \ - "psubw 80(%5),%%xmm2 \n" \ - "movq (%0),%%xmm3 \n" \ - "lea 0x8(%0),%0 \n" \ - "punpcklbw %%xmm4,%%xmm3 \n" \ - "psubsw 96(%5),%%xmm3 \n" \ - "pmullw 112(%5),%%xmm3 \n" \ - "paddsw %%xmm3,%%xmm0 \n" \ - "paddsw %%xmm3,%%xmm1 \n" \ - "paddsw %%xmm3,%%xmm2 \n" \ - "psraw $0x6,%%xmm0 \n" \ - "psraw $0x6,%%xmm1 \n" \ - "psraw $0x6,%%xmm2 \n" \ - "packuswb %%xmm0,%%xmm0 \n" \ - "packuswb %%xmm1,%%xmm1 \n" \ - "packuswb %%xmm2,%%xmm2 \n" -// Convert 8 pixels: 2 UV and 8 Y -#define YUV411TORGB \ +// Read 2 UV from 411, upsample to 8 UV +#define READYUV411 \ "movd (%1),%%xmm0 \n" \ "movd (%1,%2,1),%%xmm1 \n" \ "lea 0x2(%1),%1 \n" \ "punpcklbw %%xmm1,%%xmm0 \n" \ "punpcklwd %%xmm0,%%xmm0 \n" \ "punpckldq %%xmm0,%%xmm0 \n" \ + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB \ "movdqa %%xmm0,%%xmm1 \n" \ "movdqa %%xmm0,%%xmm2 \n" \ "pmaddubsw (%5),%%xmm0 \n" \ @@ -1352,7 +1310,8 @@ void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV444TORGB + READYUV444 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1387,7 +1346,8 @@ void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - 
YUV422TORGB + READYUV422 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1422,7 +1382,8 @@ void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV411TORGB + READYUV411 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1457,7 +1418,8 @@ void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV444TORGB + READYUV444 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1492,7 +1454,8 @@ void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV422TORGB + READYUV422 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1527,7 +1490,8 @@ void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV411TORGB + READYUV411 + YUVTORGB "punpcklbw %%xmm1,%%xmm0 \n" "punpcklbw %%xmm5,%%xmm2 \n" "movdqa %%xmm0,%%xmm1 \n" @@ -1562,7 +1526,8 @@ void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV422TORGB + READYUV422 + YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm5 \n" @@ -1598,7 +1563,8 @@ void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV422TORGB + READYUV422 + YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" @@ -1633,7 +1599,8 @@ void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV422TORGB + READYUV422 + YUVTORGB "pcmpeqb %%xmm5,%%xmm5 \n" "punpcklbw %%xmm0,%%xmm1 \n" "punpcklbw %%xmm2,%%xmm5 \n" @@ -1669,7 +1636,8 @@ void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, "pxor %%xmm4,%%xmm4 \n" ".p2align 4 \n" "1: \n" - YUV422TORGB + READYUV422 + YUVTORGB "punpcklbw %%xmm1,%%xmm2 \n" "punpcklbw %%xmm5,%%xmm0 \n" "movdqa %%xmm2,%%xmm1 \n" @@ -1741,7 +1709,7 @@ void YToARGBRow_SSE2(const uint8* y_buf, #endif ); } -#endif +#endif // HAS_YTOARGBROW_SSE2 #ifdef HAS_MIRRORROW_SSSE3 // Shuffle table for reversing the bytes. @@ -1772,7 +1740,7 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { #endif ); } -#endif +#endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSE2 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { @@ -1803,7 +1771,7 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { #endif ); } -#endif +#endif // HAS_MIRRORROW_SSE2 #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. 
@@ -1838,7 +1806,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, #endif ); } -#endif +#endif // HAS_MIRRORROW_UV_SSSE3 #ifdef HAS_ADDROW_SSE2 // dst and width aligned to 16 @@ -1939,7 +1907,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { #endif ); } -#endif +#endif // HAS_SPLITUV_SSE2 #ifdef HAS_COPYROW_SSE2 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) { @@ -1979,7 +1947,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) { : "memory", "cc" ); } -#endif +#endif // HAS_COPYROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) { diff --git a/source/row_win.cc b/source/row_win.cc index 60c1e6e31..308b08747 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -1199,6 +1199,7 @@ __asm { ret } } +#endif // HAS_ARGBTOYROW_SSSE3 #ifdef HAS_I422TOARGBROW_SSSE3 @@ -1237,80 +1238,36 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR }; // TODO(fbarchard): NV12/NV21 fetch UV and use directly. -// Convert 8 pixels: 8 UV and 8 Y -#define YUV444TORGB __asm { \ - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ +// Read 8 UV from 411 +#define READYUV444 __asm { \ __asm movq xmm0, qword ptr [esi] /* U */ \ __asm movq xmm1, qword ptr [esi + edi] /* V */ \ __asm lea esi, [esi + 8] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ - __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ - __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ - __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ - __asm psubw xmm1, kUVBiasG \ - __asm psubw xmm2, kUVBiasR \ - /* Step 2: Find Y contribution to 8 R,G,B values */ \ - __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ - __asm lea eax, [eax + 8] \ - __asm punpcklbw xmm3, xmm4 \ - __asm psubsw xmm3, kYSub16 \ - __asm pmullw xmm3, kYToRgb \ - __asm paddsw xmm0, xmm3 /* B += Y */ \ - __asm paddsw xmm1, xmm3 /* G += Y */ \ - __asm paddsw xmm2, xmm3 /* R += Y */ \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ } -// Convert 8 pixels: 4 UV and 8 Y -#define YUV422TORGB __asm { \ - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ +// Read 4 UV from 422, upsample to 8 UV +#define READYUV422 __asm { \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 4] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ - __asm movdqa xmm1, xmm0 \ - __asm movdqa xmm2, xmm0 \ - __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ - __asm pmaddubsw xmm1, kUVToG /* scale G UV */ \ - __asm pmaddubsw xmm2, kUVToR /* scale R UV */ \ - __asm psubw xmm0, kUVBiasB /* unbias back to signed */ \ - __asm psubw xmm1, kUVBiasG \ - __asm psubw xmm2, kUVBiasR \ - /* Step 2: Find Y contribution to 8 R,G,B values */ \ - __asm movq xmm3, qword ptr [eax] /* NOLINT */ \ - __asm lea eax, [eax + 8] \ - __asm punpcklbw xmm3, xmm4 \ - __asm psubsw xmm3, kYSub16 \ - __asm pmullw xmm3, kYToRgb \ - __asm paddsw xmm0, xmm3 /* B += Y */ \ - __asm paddsw xmm1, xmm3 /* G += Y */ \ - __asm paddsw xmm2, xmm3 /* R += Y */ \ - __asm psraw xmm0, 6 \ - __asm psraw xmm1, 6 \ - __asm psraw xmm2, 6 \ - __asm packuswb xmm0, xmm0 /* B */ \ - __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ } -// Convert 8 pixels: 2 UV and 8 Y -#define 
YUV411TORGB __asm { \ - /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ +// Read 2 UV from 411, upsample to 8 UV +#define READYUV411 __asm { \ __asm movd xmm0, [esi] /* U */ \ __asm movd xmm1, [esi + edi] /* V */ \ __asm lea esi, [esi + 2] \ __asm punpcklbw xmm0, xmm1 /* UV */ \ __asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \ __asm punpckldq xmm0, xmm0 /* UVUV (upsample) */ \ + } + +// Convert 8 pixels: 8 UV and 8 Y +#define YUVTORGB __asm { \ + /* Step 1: Find 4 UV contributions to 8 R,G,B values */ \ __asm movdqa xmm1, xmm0 \ __asm movdqa xmm2, xmm0 \ __asm pmaddubsw xmm0, kUVToB /* scale B UV */ \ @@ -1358,7 +1315,8 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV444TORGB + READYUV444 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1400,7 +1358,8 @@ void I422ToARGBRow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1443,7 +1402,8 @@ void I411ToARGBRow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV411TORGB + READYUV411 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1485,7 +1445,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV444TORGB + READYUV444 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1493,8 +1454,8 @@ void I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -1527,7 +1488,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1535,8 +1497,8 @@ void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -1570,7 +1532,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV411TORGB + READYUV411 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm0, xmm1 // BG @@ -1578,8 +1541,8 @@ void I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, movdqa xmm1, xmm0 punpcklwd xmm0, xmm2 // BGRA first 4 pixels punpckhwd xmm1, xmm2 // BGRA next 4 pixels - movdqa [edx], xmm0 - movdqa [edx + 16], xmm1 + movdqu [edx], xmm0 + movdqu [edx + 16], xmm1 lea edx, [edx + 32] sub ecx, 8 jg convertloop @@ -1609,7 +1572,8 @@ void I422ToBGRARow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into BGRA pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -1650,7 +1614,8 @@ void I422ToABGRRow_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm2, xmm1 // RG @@ -1689,7 +1654,8 @@ void I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into BGRA pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha @@ -1730,7 +1696,8 @@ void I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, align 16 convertloop: - YUV422TORGB + READYUV422 + YUVTORGB // Step 3: Weave into ARGB punpcklbw xmm2, xmm1 // RG @@ -1796,7 +1763,6 @@ void YToARGBRow_SSE2(const uint8* 
y_buf, } } #endif // HAS_YTOARGBROW_SSE2 -#endif #ifdef HAS_MIRRORROW_SSSE3 @@ -1825,7 +1791,7 @@ __asm { ret } } -#endif +#endif // HAS_MIRRORROW_SSSE3 #ifdef HAS_MIRRORROW_SSE2 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 @@ -1855,7 +1821,7 @@ __asm { ret } } -#endif +#endif // HAS_MIRRORROW_SSE2 #ifdef HAS_MIRRORROW_UV_SSSE3 // Shuffle table for reversing the bytes of UV channels. @@ -1891,7 +1857,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, ret } } -#endif +#endif // HAS_MIRRORROW_UV_SSSE3 #ifdef HAS_ADDROW_SSE2 // dst and width aligned to 16 @@ -1988,7 +1954,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { ret } } -#endif +#endif // HAS_SPLITUV_SSE2 #ifdef HAS_COPYROW_SSE2 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time @@ -2030,7 +1996,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) { ret } } -#endif +#endif // HAS_COPYROW_X86 #ifdef HAS_YUY2TOYROW_SSE2 __declspec(naked) __declspec(align(16))
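Note on the refactor above: the old YUV444TORGB / YUV422TORGB / YUV411TORGB macros are split into READYUV444 / READYUV422 / READYUV411, which load 8, 4 or 2 UV samples and upsample them to 8 U/V pairs, plus one shared YUVTORGB step. A minimal scalar sketch of the fixed-point math that YUVTORGB vectorizes with pmaddubsw/pmullw follows; the coefficient values are illustrative BT.601 constants scaled by 64 (to match the final psraw of 6), not the actual kUVToB/kUVToG/kUVToR/kYToRgb table values, so treat them as assumptions.

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of the fixed-point conversion; the SSSE3 macros do 8 pixels per
// iteration: READYUV444/422/411 read 8/4/2 UV bytes and upsample to 8 U/V
// pairs, then YUVTORGB applies this arithmetic with pmaddubsw/pmullw.
static void YuvPixelToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* argb) {
  // Illustrative 6-bit fixed-point BT.601 coefficients (assumed values,
  // not copied from the library's constant tables).
  const int kYToRgb = 74;              // 1.164 * 64
  const int kUToB = 129, kUToG = -25;  // 2.018 * 64, -0.391 * 64
  const int kVToG = -52, kVToR = 102;  // -0.813 * 64, 1.596 * 64

  int y1 = (y - 16) * kYToRgb;         // psubsw kYSub16; pmullw kYToRgb
  int b = (u - 128) * kUToB;           // UV contributions (pmaddubsw + bias)
  int g = (u - 128) * kUToG + (v - 128) * kVToG;
  int r = (v - 128) * kVToR;

  argb[0] = Clamp255((b + y1) >> 6);   // B: paddsw; psraw 6; packuswb
  argb[1] = Clamp255((g + y1) >> 6);   // G
  argb[2] = Clamp255((r + y1) >> 6);   // R
  argb[3] = 255;                       // A: 0xff alpha lane (pcmpeqb xmm5)
}

The SSSE3 rows then weave the packed B/G/R/A bytes into 32-bit pixels with punpcklbw/punpcklwd before storing.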
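Separately, the _Unaligned_ ARGB store paths in row_win.cc switch movdqa to movdqu: movdqa faults if the destination is not 16-byte aligned, while movdqu accepts any address. A minimal intrinsics sketch of that distinction (hypothetical helper name, assuming SSE2):

#include <emmintrin.h>
#include <stdint.h>

// Store 32 bytes of ARGB output, choosing the store instruction by
// destination alignment, mirroring the aligned vs _Unaligned_ row variants.
void StoreARGB32(uint8_t* dst, __m128i lo, __m128i hi) {
  if (((uintptr_t)dst & 15) == 0) {
    _mm_store_si128((__m128i*)dst, lo);          // movdqa: needs 16-byte alignment
    _mm_store_si128((__m128i*)(dst + 16), hi);
  } else {
    _mm_storeu_si128((__m128i*)dst, lo);         // movdqu: any address is fine
    _mm_storeu_si128((__m128i*)(dst + 16), hi);
  }
}

The non-_Unaligned_ variants keep movdqa on the assumption that callers pass 16-byte-aligned destination rows.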