Fix ConvertToI420 when using YUY2 or UYVY with odd crop_x.

- swap U and V when crop_x is odd
- document YUY2 and UYVY formats
- apply clang-format

Bug: libyuv:902
Change-Id: I045e44c907f4a9eb625d7c024b669bb308055f32
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3039549
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Author: Frank Barchard (committed by Frank Barchard)
Date: 2021-07-19 12:04:32 -07:00
Parent: 0572e0a0b1
Commit: 639dd4ea76
5 changed files with 147 additions and 133 deletions

README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1788
+Version: 1789
 License: BSD
 License File: LICENSE

docs/formats.md

@@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest.
 Each channel has value ranges from 0 to 65535.
 AR64 is similar to ARGB.
 # NV12 and NV21
 NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling.
 NV16 is 16 bits per pixel, with half width and full height. aka 422.
 NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
 Most NV12 functions allow the destination Y pointer to be NULL.
+# YUY2 and UYVY
+YUY2 is a packed YUV format with half width, full height.
+YUY2 is YUYV in memory
+UYVY is UYVY in memory
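
For orientation (not part of the commit), the byte layout explains why an odd crop_x swaps chroma: YUY2 stores two pixels in four bytes as Y0 U Y1 V, and UYVY as U Y0 V Y1, so chroma is anchored at even x. A crop starting at an odd x begins mid macropixel, and the byte that arrives in the U position is actually V. A minimal C++ sketch of reading one chroma pair (the helper name is hypothetical; caller must keep x + 1 < width):

#include <cstdint>

// YUY2 row bytes: Y0 U0 Y1 V0 | Y2 U1 Y3 V1 | ...
// Starting at odd x the stream reads Y V Y U, so U and V trade places.
void ReadYuy2Chroma(const uint8_t* row, int x, uint8_t* u, uint8_t* v) {
  const uint8_t* p = row + x * 2;  // 2 bytes per pixel
  if ((x & 1) == 0) {
    *u = p[1];  // Y U Y V phase
    *v = p[3];
  } else {
    *v = p[1];  // Y V Y U phase: chroma arrives swapped
    *u = p[3];
  }
}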

include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1788
+#define LIBYUV_VERSION 1789
 #endif  // INCLUDE_LIBYUV_VERSION_H_

source/convert_to_i420.cc

@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
   switch (format) {
     // Single plane formats
-    case FOURCC_YUY2:
+    case FOURCC_YUY2: {  // TODO(fbarchard): Find better odd crop fix.
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
-    case FOURCC_UYVY:
+    }
+    case FOURCC_UYVY: {
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
+    }
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
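
A hedged usage sketch of the fixed path (dimensions, buffers, and the wrapper name are illustrative; the ConvertToI420 signature is from include/libyuv/convert.h): with this change, a crop that starts at an odd x fills the U and V planes in the correct order.

#include "libyuv/convert.h"
#include "libyuv/rotate.h"
#include "libyuv/video_common.h"

// Crop a 100x100 region starting at odd x = 1 from a 640x480 YUY2 frame.
int CropAtOddX(const uint8_t* yuy2, size_t yuy2_size,
               uint8_t* dst_y, uint8_t* dst_u, uint8_t* dst_v) {
  const int kCropW = 100, kCropH = 100;
  return libyuv::ConvertToI420(yuy2, yuy2_size,
                               dst_y, kCropW,            // Y plane, stride = width
                               dst_u, (kCropW + 1) / 2,  // U plane, half-width stride
                               dst_v, (kCropW + 1) / 2,  // V plane
                               /*crop_x=*/1, /*crop_y=*/0,
                               /*src_width=*/640, /*src_height=*/480,
                               kCropW, kCropH,
                               libyuv::kRotate0, libyuv::FOURCC_YUY2);
}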

source/row_win.cc

@@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2(
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
@@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
@@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
phaddsw xmm6, xmm1 // G
psraw xmm0, 6 // B
psraw xmm6, 6 // G
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
psraw xmm1, 6 // R
psraw xmm6, 6 // A
packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
punpcklbw xmm1, xmm6 // 8 RA values
@@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
convertloop:
movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
@@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
@@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
pmulhuw xmm0, xmm2 // argb * value
pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
@@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
@@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
@@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y0
mov esi, [esp + 8 + 8] // src_y1
mov edi, [esp + 8 + 12] // src_y2
mov edx, [esp + 8 + 16] // dst_sobelx
mov ecx, [esp + 8 + 20] // width
@@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
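
The row_win.cc hunks in this commit only re-apply clang-format. As a reading aid, a scalar sketch of what SobelXRow computes per pixel, modeled on libyuv's C reference path (treat the exact clamping details as an assumption; the asm above processes 8 pixels at a time):

#include <cstdint>
#include <cstdlib>

// Horizontal Sobel response from three adjacent rows: 1-2-1 weighted
// differences of pixels two columns apart, clamped to a byte.
void SobelXRowSketch(const uint8_t* src_y0, const uint8_t* src_y1,
                     const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];  // matches the [eax] / [eax + 2] reads
    int b = src_y1[i] - src_y1[i + 2];  // [eax + esi] / [eax + esi + 2]
    int c = src_y2[i] - src_y2[i + 2];  // [eax + edi] / [eax + edi + 2]
    int sobel = abs(a + b * 2 + c);
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);
  }
}
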
@@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_y0
mov esi, [esp + 4 + 8] // src_y1
mov edx, [esp + 4 + 12] // dst_sobely
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
pslld xmm5, 24 // 0xff000000
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
@@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
@@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
test edx, 15
jne l4b
// 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
@@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
movq xmm2, qword ptr [ecx] // uv
movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 28] // width
shl esi, 16 // 4, stride
@@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
sub ecx, 4
jl l4b
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
packssdw xmm0, xmm1 // x, y as 8 shorts
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd xmm1, [eax + esi] // read pixel 0
movd xmm6, [eax + edi] // read pixel 1
punpckldq xmm1, xmm6 // combine pixel 0 and 1
@@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
__asm {
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
cmp eax, 128
je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
vmovd xmm0, eax // high fraction 0..255
neg eax
@@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 8
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
@@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
movd xmm0, eax // high fraction 0..255
neg eax
@@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
psubb xmm0, xmm4 // bias image by -128
psubb xmm1, xmm4
movdqa xmm2, xmm5
movdqa xmm3, xmm5
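
InterpolateRow blends two rows with a fraction out of 256; the dispatch above short-circuits fraction 0 (copy) and 128 (50/50 average). A scalar sketch of the general blend, assumed to match libyuv's C fallback including its rounding:

#include <cstdint>

// out = (src * (256 - f) + src_next * f + 128) >> 8, plus the two cheap
// special cases the asm dispatches to.
void InterpolateRowSketch(uint8_t* dst, const uint8_t* src,
                          const uint8_t* src_next, int width, int f) {
  if (f == 0) {  // 0 / 256: copy src row
    for (int i = 0; i < width; ++i) dst[i] = src[i];
  } else if (f == 128) {  // 128 / 256: rounded average
    for (int i = 0; i < width; ++i)
      dst[i] = (uint8_t)((src[i] + src_next[i] + 1) >> 1);
  } else {
    for (int i = 0; i < width; ++i)
      dst[i] = (uint8_t)((src[i] * (256 - f) + src_next[i] * f + 128) >> 8);
  }
}
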
@@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // width
@@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // width
@@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
@@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqu [edi], xmm1
movdqu [edi + 16], xmm2
@@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3
@@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
const float* poly,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* poly */
vbroadcastf128 ymm4, [ecx] // C0
vbroadcastf128 ymm5, [ecx + 16] // C1
@@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
mulss xmm4, kExpBias
@@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vcvtdq2ps ymm2, ymm2
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
vmulps ymm2, ymm2, ymm4
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [eax + edx - 32], ymm2
@@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
add eax, 32
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [eax + edx + 32], xmm2
vmovdqu [eax + edx + 32 + 16], xmm3
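
Unlike the F16C path above, HalfFloatRow_SSE2/AVX2 use an exponent-rebias trick: multiplying by kExpBias moves the float exponent into half-float range so a plain right shift by 13 yields the half bits, truncating rather than rounding. A sketch for non-negative in-range inputs (the exact kExpBias value, 2^-112, is an assumption):

#include <cstdint>
#include <cstring>

// Truncating float -> half for non-negative values: after multiplying by
// 2^-112, bits 28..13 of the float already hold the half-float pattern.
uint16_t HalfFromUint16(uint16_t value, float scale) {
  const float kExpBias = 1.9259299444e-34f;  // 2^-112, assumed to match the asm
  float f = (float)value * (scale * kExpBias);
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));
  return (uint16_t)(bits >> 13);  // truncate, as psrld/vpsrld 13 does above
}
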
@@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] /* src_argb */
mov edi, [esp + 8 + 8] /* dst_argb */
mov ecx, [esp + 8 + 12] /* width */
movd xmm2, dword ptr [esp + 8 + 16] // luma table
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
@@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3