diff --git a/README.chromium b/README.chromium index 7688ae57b..b60adf7c7 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1788 +Version: 1789 License: BSD License File: LICENSE diff --git a/docs/formats.md b/docs/formats.md index d628f7f96..12ea9465e 100644 --- a/docs/formats.md +++ b/docs/formats.md @@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest. Each channel has value ranges from 0 to 65535. AR64 is similar to ARGB. - # NV12 and NV21 NV12 is a biplanar format with a full sized Y plane followed by a single @@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling. NV16 is 16 bits per pixel, with half width and full height. aka 422. NV24 is 24 bits per pixel with full sized chroma channel. aka 444. Most NV12 functions allow the destination Y pointer to be NULL. + +# YUY2 and UYVY + +YUY2 is a packed YUV format with half width, full height chroma. aka 422. + +YUY2 is YUYV in memory +UYVY is UYVY in memory diff --git a/include/libyuv/version.h b/include/libyuv/version.h index f713c4770..8b06777fc 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1788 +#define LIBYUV_VERSION 1789 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/convert_to_i420.cc b/source/convert_to_i420.cc index ac6eeab24..5869ecd7b 100644 --- a/source/convert_to_i420.cc +++ b/source/convert_to_i420.cc @@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample, switch (format) { // Single plane formats - case FOURCC_YUY2: + case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix. + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? 
dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); + r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); break; - case FOURCC_UYVY: + } + case FOURCC_UYVY: { + uint8_t* u = (crop_x & 1) ? dst_v : dst_u; + uint8_t* v = (crop_x & 1) ? dst_u : dst_v; + int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u; + int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v; src = sample + (aligned_src_width * crop_y + crop_x) * 2; - r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u, - dst_stride_u, dst_v, dst_stride_v, crop_width, - inv_crop_height); + r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u, + stride_u, v, stride_v, crop_width, inv_crop_height); break; + } case FOURCC_RGBP: src = sample + (src_width * crop_y + crop_x) * 2; r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u, diff --git a/source/row_win.cc b/source/row_win.cc index 5203b57c6..7dccacc7f 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2( __asm psraw xmm2, 6 \ __asm packuswb xmm0, xmm0 /* B */ \ __asm packuswb xmm1, xmm1 /* G */ \ - __asm packuswb xmm2, xmm2 /* R */ \ + __asm packuswb xmm2, xmm2 /* R */ \ } // Store 8 ARGB values. 
@@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) { pmaddubsw xmm6, xmm2 phaddw xmm0, xmm6 psrlw xmm0, 7 - packuswb xmm0, xmm0 // 8 B values + packuswb xmm0, xmm0 // 8 B values movdqu xmm5, [eax] // G movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm3 pmaddubsw xmm1, xmm3 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 G values - punpcklbw xmm0, xmm5 // 8 BG values + packuswb xmm5, xmm5 // 8 G values + punpcklbw xmm0, xmm5 // 8 BG values movdqu xmm5, [eax] // R movdqu xmm1, [eax + 16] pmaddubsw xmm5, xmm4 pmaddubsw xmm1, xmm4 phaddw xmm5, xmm1 psrlw xmm5, 7 - packuswb xmm5, xmm5 // 8 R values + packuswb xmm5, xmm5 // 8 R values movdqu xmm6, [eax] // A movdqu xmm1, [eax + 16] psrld xmm6, 24 @@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb, movdqu xmm1, [eax + 16] pmaddubsw xmm6, xmm3 pmaddubsw xmm1, xmm3 - phaddsw xmm0, xmm7 // B - phaddsw xmm6, xmm1 // G - psraw xmm0, 6 // B - psraw xmm6, 6 // G - packuswb xmm0, xmm0 // 8 B values - packuswb xmm6, xmm6 // 8 G values - punpcklbw xmm0, xmm6 // 8 BG values + phaddsw xmm0, xmm7 // B + phaddsw xmm6, xmm1 // G + psraw xmm0, 6 // B + psraw xmm6, 6 // G + packuswb xmm0, xmm0 // 8 B values + packuswb xmm6, xmm6 // 8 G values + punpcklbw xmm0, xmm6 // 8 BG values movdqu xmm1, [eax] // R movdqu xmm7, [eax + 16] pmaddubsw xmm1, xmm4 pmaddubsw xmm7, xmm4 - phaddsw xmm1, xmm7 // R + phaddsw xmm1, xmm7 // R movdqu xmm6, [eax] // A movdqu xmm7, [eax + 16] pmaddubsw xmm6, xmm5 pmaddubsw xmm7, xmm5 phaddsw xmm6, xmm7 // A - psraw xmm1, 6 // R - psraw xmm6, 6 // A + psraw xmm1, 6 // R + psraw xmm6, 6 // A packuswb xmm1, xmm1 // 8 R values packuswb xmm6, xmm6 // 8 A values punpcklbw xmm1, xmm6 // 8 RA values @@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb, convertloop: movdqu xmm0, [eax] // read 4 pixels - punpcklbw xmm0, xmm5 // first 2 pixels - pmulhuw xmm0, xmm2 // pixel * scale >> 16 + punpcklbw 
xmm0, xmm5 // first 2 pixels + pmulhuw xmm0, xmm2 // pixel * scale >> 16 movdqu xmm1, [eax] // read 4 pixels - punpckhbw xmm1, xmm5 // next 2 pixels + punpckhbw xmm1, xmm5 // next 2 pixels pmulhuw xmm1, xmm2 - pmullw xmm0, xmm3 // * interval_size + pmullw xmm0, xmm3 // * interval_size movdqu xmm7, [eax] // read 4 pixels pmullw xmm1, xmm3 - pand xmm7, xmm6 // mask alpha - paddw xmm0, xmm4 // + interval_size / 2 + pand xmm7, xmm6 // mask alpha + paddw xmm0, xmm4 // + interval_size / 2 paddw xmm1, xmm4 packuswb xmm0, xmm1 por xmm0, xmm7 @@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, int width, uint32_t value) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb - mov ecx, [esp + 12] // width + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb + mov ecx, [esp + 12] // width movd xmm2, [esp + 16] // value punpcklbw xmm2, xmm2 punpcklqdq xmm2, xmm2 @@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb, movdqu xmm0, [eax] // read 4 pixels lea eax, [eax + 16] movdqa xmm1, xmm0 - punpcklbw xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - pmulhuw xmm0, xmm2 // argb * value - pmulhuw xmm1, xmm2 // argb * value + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + pmulhuw xmm0, xmm2 // argb * value + pmulhuw xmm1, xmm2 // argb * value psrlw xmm0, 8 psrlw xmm1, 8 packuswb xmm0, xmm1 @@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - pxor xmm5, xmm5 // constant 0 + pxor xmm5, xmm5 // constant 0 convertloop: movdqu xmm0, [eax] // read 4 pixels from src_argb movdqu xmm2, [esi] // read 4 pixels from src_argb1 movdqu xmm1, xmm0 movdqu xmm3, xmm2 - punpcklbw 
xmm0, xmm0 // first 2 - punpckhbw xmm1, xmm1 // next 2 - punpcklbw xmm2, xmm5 // first 2 - punpckhbw xmm3, xmm5 // next 2 - pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 - pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 + punpcklbw xmm0, xmm0 // first 2 + punpckhbw xmm1, xmm1 // next 2 + punpcklbw xmm2, xmm5 // first 2 + punpckhbw xmm3, xmm5 // next 2 + pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2 + pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2 lea eax, [eax + 16] lea esi, [esi + 16] packuswb xmm0, xmm1 @@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, [esi + 16] - paddusb xmm0, xmm1 // src_argb + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb, lea eax, [eax + 4] movd xmm1, [esi] // read 1 pixels from src_argb1 lea esi, [esi + 4] - paddusb xmm0, xmm1 // src_argb + src_argb1 + paddusb xmm0, xmm1 // src_argb + src_argb1 movd [edx], xmm0 lea edx, [edx + 4] sub ecx, 1 @@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb, lea eax, [eax + 16] movdqu xmm1, [esi] // read 4 pixels from src_argb1 lea esi, 
[esi + 16] - psubusb xmm0, xmm1 // src_argb - src_argb1 + psubusb xmm0, xmm1 // src_argb - src_argb1 movdqu [edx], xmm0 lea edx, [edx + 16] sub ecx, 4 @@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width - vpxor ymm5, ymm5, ymm5 // constant 0 + vpxor ymm5, ymm5, ymm5 // constant 0 convertloop: vmovdqu ymm1, [eax] // read 8 pixels from src_argb @@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_argb - mov esi, [esp + 4 + 8] // src_argb1 + mov eax, [esp + 4 + 4] // src_argb + mov esi, [esp + 4 + 8] // src_argb1 mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width @@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y0 - mov esi, [esp + 8 + 8] // src_y1 + mov eax, [esp + 8 + 4] // src_y0 + mov esi, [esp + 8 + 8] // src_y1 mov edi, [esp + 8 + 12] // src_y2 mov edx, [esp + 8 + 16] // dst_sobelx mov ecx, [esp + 8 + 20] // width @@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2] punpcklbw xmm0, xmm5 
punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] + movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] + movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0] movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_y0 - mov esi, [esp + 4 + 8] // src_y1 + mov eax, [esp + 4 + 4] // src_y0 + mov esi, [esp + 4 + 8] // src_y1 mov edx, [esp + 4 + 12] // dst_sobely mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0, pxor xmm5, xmm5 // constant 0 convertloop: - movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] + movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0] movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0] punpcklbw xmm0, xmm5 punpcklbw xmm1, xmm5 psubw xmm0, xmm1 - movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] + movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1] movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1] punpcklbw xmm1, xmm5 punpcklbw xmm2, xmm5 psubw xmm1, xmm2 - movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] + movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2] movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2] punpcklbw xmm2, xmm5 punpcklbw xmm3, xmm5 @@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, 
[esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx, pslld xmm5, 24 // 0xff000000 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] paddusb xmm0, xmm1 // sobel = sobelx + sobely @@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax @@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx, int width) { __asm { push esi - mov eax, [esp + 4 + 4] // src_sobelx - mov esi, [esp + 4 + 8] // src_sobely + mov eax, [esp + 4 + 4] // src_sobelx + mov esi, [esp + 4 + 8] // src_sobely mov edx, [esp + 4 + 12] // dst_argb mov ecx, [esp + 4 + 16] // width sub esi, eax pcmpeqb xmm5, xmm5 // alpha 255 convertloop: - movdqu xmm0, [eax] // read 16 pixels src_sobelx + movdqu xmm0, [eax] // read 16 pixels src_sobelx movdqu xmm1, [eax + esi] // read 16 pixels src_sobely lea eax, [eax + 16] movdqa xmm2, xmm0 @@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movdqu xmm0, [eax] psubd xmm0, [eax + edx * 4] @@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, test edx, 15 jne l4b - // 4 pixel loop + // 4 pixel loop l4: movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 
lea eax, [eax + 16] @@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row, add ecx, 4 - 1 jl l1b - // 1 pixel loop + // 1 pixel loop l1: movd xmm2, dword ptr [eax] // 1 argb pixel lea eax, [eax + 4] @@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, mov esi, [esp + 16] // stride mov edx, [esp + 20] // dst_argb mov ecx, [esp + 24] // pointer to uv_dudv - movq xmm2, qword ptr [ecx] // uv + movq xmm2, qword ptr [ecx] // uv movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 28] // width shl esi, 16 // 4, stride @@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, sub ecx, 4 jl l4b - // setup for 4 pixel loop + // setup for 4 pixel loop pshufd xmm7, xmm7, 0x44 // dup dudv pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 @@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb, addps xmm3, xmm4 addps xmm4, xmm4 // dudv *= 4 - // 4 pixel loop + // 4 pixel loop l4: cvttps2dq xmm0, xmm2 // x, y float to int first 2 cvttps2dq xmm1, xmm3 // x, y float to int next 2 packssdw xmm0, xmm1 // x, y as 8 shorts pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 
movd esi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd edi, xmm0 - pshufd xmm0, xmm0, 0x39 // shift right + pshufd xmm0, xmm0, 0x39 // shift right movd xmm1, [eax + esi] // read pixel 0 movd xmm6, [eax + edi] // read pixel 1 punpckldq xmm1, xmm6 // combine pixel 0 and 1 @@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, __asm { push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) @@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, je xloop100 // 0 / 256. Blend 100 / 0. sub edi, esi cmp eax, 128 - je xloop50 // 128 /256 is 0.50. Blend 50 / 50. + je xloop50 // 128 /256 is 0.50. Blend 50 / 50. vmovd xmm0, eax // high fraction 0..255 neg eax @@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr, vpaddw ymm0, ymm0, ymm4 vpsrlw ymm1, ymm1, 8 vpsrlw ymm0, ymm0, 8 - vpackuswb ymm0, ymm0, ymm1 // unmutates + vpackuswb ymm0, ymm0, ymm1 // unmutates vmovdqu [esi + edi], ymm0 lea esi, [esi + 32] sub ecx, 32 @@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, push esi push edi - mov edi, [esp + 8 + 4] // dst_ptr - mov esi, [esp + 8 + 8] // src_ptr + mov edi, [esp + 8 + 4] // dst_ptr + mov esi, [esp + 8 + 8] // src_ptr mov edx, [esp + 8 + 12] // src_stride mov ecx, [esp + 8 + 16] // dst_width mov eax, [esp + 8 + 20] // source_y_fraction (0..255) sub edi, esi - // Dispatch to specialized filters if applicable. + // Dispatch to specialized filters if applicable. cmp eax, 0 je xloop100 // 0 /256. Blend 100 / 0. cmp eax, 128 - je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. + je xloop50 // 128 / 256 is 0.50. Blend 50 / 50. 
movd xmm0, eax // high fraction 0..255 neg eax @@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr, movdqu xmm1, xmm0 punpcklbw xmm0, xmm2 punpckhbw xmm1, xmm2 - psubb xmm0, xmm4 // bias image by -128 + psubb xmm0, xmm4 // bias image by -128 psubb xmm1, xmm4 movdqa xmm2, xmm5 movdqa xmm3, xmm5 @@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler movdqu xmm5, [ecx] mov ecx, [esp + 16] // width @@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb, const uint8_t* shuffler, int width) { __asm { - mov eax, [esp + 4] // src_argb - mov edx, [esp + 8] // dst_argb + mov eax, [esp + 4] // src_argb + mov edx, [esp + 8] // dst_argb mov ecx, [esp + 12] // shuffler vbroadcastf128 ymm5, [ecx] // same shuffle in high as low. 
mov ecx, [esp + 16] // width @@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y lea eax, [eax + 16] movdqa xmm1, xmm0 @@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y, __asm { push esi push edi - mov eax, [esp + 8 + 4] // src_y - mov esi, [esp + 8 + 8] // src_u + mov eax, [esp + 8 + 4] // src_y + mov esi, [esp + 8 + 8] // src_u mov edx, [esp + 8 + 12] // src_v mov edi, [esp + 8 + 16] // dst_frame mov ecx, [esp + 8 + 20] // width sub edx, esi convertloop: - movq xmm2, qword ptr [esi] // U + movq xmm2, qword ptr [esi] // U movq xmm3, qword ptr [esi + edx] // V lea esi, [esi + 8] - punpcklbw xmm2, xmm3 // UV + punpcklbw xmm2, xmm3 // UV movdqu xmm0, [eax] // Y movdqa xmm1, xmm2 lea eax, [eax + 16] - punpcklbw xmm1, xmm0 // UYVY + punpcklbw xmm1, xmm0 // UYVY punpckhbw xmm2, xmm0 movdqu [edi], xmm1 movdqu [edi + 16], xmm2 @@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb, mov ecx, [esp + 4 + 16] /* width */ pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints. - // 2 pixel loop. + // 2 pixel loop. 
convertloop: - // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel - // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel + // pmovzxbd xmm0, dword ptr [eax] // BGRA pixel + // pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel movq xmm0, qword ptr [eax] // BGRABGRA lea eax, [eax + 8] punpcklbw xmm0, xmm3 @@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb, const float* poly, int width) { __asm { - mov eax, [esp + 4] /* src_argb */ - mov edx, [esp + 8] /* dst_argb */ + mov eax, [esp + 4] /* src_argb */ + mov edx, [esp + 8] /* dst_argb */ mov ecx, [esp + 12] /* poly */ vbroadcastf128 ymm4, [ecx] // C0 vbroadcastf128 ymm5, [ecx + 16] // C1 @@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ movd xmm4, dword ptr [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ mulss xmm4, kExpBias @@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src, pxor xmm5, xmm5 sub edx, eax - // 8 pixel loop. + // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts add eax, 16 @@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vpxor ymm5, ymm5, ymm5 sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts add eax, 32 @@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src, vcvtdq2ps ymm2, ymm2 vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range. 
vmulps ymm2, ymm2, ymm4 - vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate + vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate vpsrld ymm2, ymm2, 13 vpackssdw ymm2, ymm2, ymm3 vmovdqu [eax + edx - 32], ymm2 @@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src, float scale, int width) { __asm { - mov eax, [esp + 4] /* src */ - mov edx, [esp + 8] /* dst */ + mov eax, [esp + 4] /* src */ + mov edx, [esp + 8] /* dst */ vbroadcastss ymm4, [esp + 12] /* scale */ - mov ecx, [esp + 16] /* width */ + mov ecx, [esp + 16] /* width */ sub edx, eax - // 16 pixel loop. + // 16 pixel loop. convertloop: vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts add eax, 32 - vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats + vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats vcvtdq2ps ymm3, ymm3 vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1 vmulps ymm3, ymm3, ymm4 - vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate + vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate vcvtps2ph xmm3, ymm3, 3 vmovdqu [eax + edx + 32], xmm2 vmovdqu [eax + edx + 32 + 16], xmm3 @@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. @@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb, int width) { __asm { push esi - mov eax, [esp + 4 + 4] /* dst_argb */ - mov esi, [esp + 4 + 8] /* table_argb */ + mov eax, [esp + 4 + 4] /* dst_argb */ + mov esi, [esp + 4 + 8] /* table_argb */ mov ecx, [esp + 4 + 12] /* width */ // 1 pixel loop. 
@@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, __asm { push esi push edi - mov eax, [esp + 8 + 4] /* src_argb */ - mov edi, [esp + 8 + 8] /* dst_argb */ + mov eax, [esp + 8 + 4] /* src_argb */ + mov edi, [esp + 8 + 8] /* dst_argb */ mov ecx, [esp + 8 + 12] /* width */ movd xmm2, dword ptr [esp + 8 + 16] // luma table movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff @@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb, psllw xmm4, 8 pxor xmm5, xmm5 - // 4 pixel loop. + // 4 pixel loop. convertloop: movdqu xmm0, xmmword ptr [eax] // generate luma ptr pmaddubsw xmm0, xmm3