Fix ConvertToI420 when using YUY2 or UYVY with odd crop_x.

- swap U and V planes when crop_x is odd
- document YUY2 and UYVY formats
- apply clang-format

Bug: libyuv:902
Change-Id: I045e44c907f4a9eb625d7c024b669bb308055f32
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3039549
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Frank Barchard, 2021-07-19 12:04:32 -07:00
commit 639dd4ea76 (parent 0572e0a0b1)
5 changed files with 147 additions and 133 deletions

README.chromium

@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1788
+Version: 1789
License: BSD
License File: LICENSE

docs/formats.md

@@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest.
Each channel has value ranges from 0 to 65535.
AR64 is similar to ARGB.
# NV12 and NV21
NV12 is a biplanar format with a full sized Y plane followed by a single
@@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling.
NV16 is 16 bits per pixel, with half width and full height. aka 422.
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
Most NV12 functions allow the destination Y pointer to be NULL.
+# YUY2 and UYVY
+YUY2 is a packed YUV format with half width, full height chroma (4:2:2).
+YUY2 is ordered Y0 U Y1 V in memory.
+UYVY is ordered U Y0 V Y1 in memory.
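As an aside for readers new to these FOURCCs, here is a standalone sketch of the byte order the new docs describe (an editor's illustration, not part of the committed change); two luma samples share one U and one V sample:

// Sketch: YUY2 vs UYVY byte order for one two-pixel macropixel (4:2:2).
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t y0 = 0x10, y1 = 0x20, u = 0x80, v = 0x90;
  uint8_t yuy2[4] = {y0, u, y1, v};  // YUY2: Y0 U Y1 V
  uint8_t uyvy[4] = {u, y0, v, y1};  // UYVY: U Y0 V Y1
  std::printf("YUY2: %02X %02X %02X %02X\n", yuy2[0], yuy2[1], yuy2[2], yuy2[3]);
  std::printf("UYVY: %02X %02X %02X %02X\n", uyvy[0], uyvy[1], uyvy[2], uyvy[3]);
  return 0;
}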

include/libyuv/version.h

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1788
+#define LIBYUV_VERSION 1789
#endif // INCLUDE_LIBYUV_VERSION_H_

source/convert_to_i420.cc

@@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
   switch (format) {
     // Single plane formats
-    case FOURCC_YUY2:
+    case FOURCC_YUY2: {  // TODO(fbarchard): Find better odd crop fix.
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
-    case FOURCC_UYVY:
+    }
+    case FOURCC_UYVY: {
+      uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
+      uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
+      int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
+      int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
       src = sample + (aligned_src_width * crop_y + crop_x) * 2;
-      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
-                     dst_stride_u, dst_v, dst_stride_v, crop_width,
-                     inv_crop_height);
+      r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
+                     stride_u, v, stride_v, crop_width, inv_crop_height);
       break;
+    }
     case FOURCC_RGBP:
       src = sample + (src_width * crop_y + crop_x) * 2;
       r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
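The swap exists because YUY2/UYVY chroma repeats every two luma pixels: cropping an odd number of luma pixels means the byte the converter reads as U is really V, and vice versa. A small standalone demo (an editor's illustration, not part of the commit):

// Demo: why an odd crop_x swaps chroma in YUY2 (Y0 U Y1 V per 2 pixels).
#include <cstdint>
#include <cstdio>

int main() {
  // One row of 4 pixels. Y = 10..13, chroma pairs (U0=20, V0=30), (U1=21, V1=31).
  uint8_t row[8] = {10, 20, 11, 30, 12, 21, 13, 31};
  int crop_x = 1;                          // odd crop, 2 bytes per luma pixel
  const uint8_t* src = row + crop_x * 2;
  uint8_t u = src[1], v = src[3];          // naive YUYV read from the crop
  std::printf("read U=%d V=%d (actually V0=30 and U1=21)\n", u, v);
  uint8_t fixed_u = (crop_x & 1) ? v : u;  // the fix: route each sample to the
  uint8_t fixed_v = (crop_x & 1) ? u : v;  // opposite plane when crop_x is odd
  std::printf("after swap U=%d V=%d\n", fixed_u, fixed_v);
  return 0;
}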

source/row_win.cc

@@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2(
__asm psraw xmm2, 6 \
__asm packuswb xmm0, xmm0 /* B */ \
__asm packuswb xmm1, xmm1 /* G */ \
__asm packuswb xmm2, xmm2 /* R */ \
}
// Store 8 ARGB values.
@@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
pmaddubsw xmm6, xmm2
phaddw xmm0, xmm6
psrlw xmm0, 7
packuswb xmm0, xmm0 // 8 B values
movdqu xmm5, [eax] // G
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm3
pmaddubsw xmm1, xmm3
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 G values
punpcklbw xmm0, xmm5 // 8 BG values
movdqu xmm5, [eax] // R
movdqu xmm1, [eax + 16]
pmaddubsw xmm5, xmm4
pmaddubsw xmm1, xmm4
phaddw xmm5, xmm1
psrlw xmm5, 7
packuswb xmm5, xmm5 // 8 R values
movdqu xmm6, [eax] // A
movdqu xmm1, [eax + 16]
psrld xmm6, 24
@@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
movdqu xmm1, [eax + 16]
pmaddubsw xmm6, xmm3
pmaddubsw xmm1, xmm3
phaddsw xmm0, xmm7 // B
phaddsw xmm6, xmm1 // G
psraw xmm0, 6 // B
psraw xmm6, 6 // G
packuswb xmm0, xmm0 // 8 B values
packuswb xmm6, xmm6 // 8 G values
punpcklbw xmm0, xmm6 // 8 BG values
movdqu xmm1, [eax] // R
movdqu xmm7, [eax + 16]
pmaddubsw xmm1, xmm4
pmaddubsw xmm7, xmm4
phaddsw xmm1, xmm7 // R
movdqu xmm6, [eax] // A
movdqu xmm7, [eax + 16]
pmaddubsw xmm6, xmm5
pmaddubsw xmm7, xmm5
phaddsw xmm6, xmm7 // A
psraw xmm1, 6 // R
psraw xmm6, 6 // A
packuswb xmm1, xmm1 // 8 R values
packuswb xmm6, xmm6 // 8 A values
punpcklbw xmm1, xmm6 // 8 RA values
@@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
convertloop:
movdqu xmm0, [eax] // read 4 pixels
punpcklbw xmm0, xmm5 // first 2 pixels
pmulhuw xmm0, xmm2 // pixel * scale >> 16
movdqu xmm1, [eax] // read 4 pixels
punpckhbw xmm1, xmm5 // next 2 pixels
pmulhuw xmm1, xmm2
pmullw xmm0, xmm3 // * interval_size
movdqu xmm7, [eax] // read 4 pixels
pmullw xmm1, xmm3
pand xmm7, xmm6 // mask alpha
paddw xmm0, xmm4 // + interval_size / 2
paddw xmm1, xmm4
packuswb xmm0, xmm1
por xmm0, xmm7
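
This hunk only reflows comments; the math those comments describe is roughly the following scalar sketch (an editor's paraphrase, not the library's C path; it assumes scale and interval_size are chosen so values stay in range, where the SSE2 code saturates via packuswb):

// Posterize B, G, R to interval_size steps; alpha (argb[i + 3]) is kept.
#include <cstdint>

void QuantizeSketch(uint8_t* argb, int scale, int interval_size, int width) {
  for (int i = 0; i < width * 4; i += 4) {
    for (int c = 0; c < 3; ++c) {            // B, G, R
      int q = (argb[i + c] * scale) >> 16;   // pmulhuw: pixel * scale >> 16
      argb[i + c] = (uint8_t)(q * interval_size        // pmullw
                              + interval_size / 2);    // paddw
    }
  }
}
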
@@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
int width,
uint32_t value) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2
@@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
movdqu xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
pmulhuw xmm0, xmm2 // argb * value
pmulhuw xmm1, xmm2 // argb * value
psrlw xmm0, 8
psrlw xmm1, 8
packuswb xmm0, xmm1
@@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
@@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb + src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb + src_argb1
movd [edx], xmm0
lea edx, [edx + 4]
sub ecx, 1
@@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb - src_argb1
movdqu [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
@@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
@@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_argb
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
@@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y0
mov esi, [esp + 8 + 8] // src_y1
mov edi, [esp + 8 + 12] // src_y2
mov edx, [esp + 8 + 16] // dst_sobelx
mov ecx, [esp + 8 + 20] // width
@@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
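
The loads commented above give away SobelX's tap pattern; in scalar form it is roughly this sketch (an editor's reconstruction from the comments, not code from this commit): a horizontal difference of columns 0 and 2 on three rows, the middle row weighted twice, then saturation to a byte.

#include <cstdint>
#include <cstdlib>

void SobelXSketch(const uint8_t* src_y0, const uint8_t* src_y1,
                  const uint8_t* src_y2, uint8_t* dst_sobelx, int width) {
  for (int i = 0; i < width; ++i) {
    int a = src_y0[i] - src_y0[i + 2];
    int b = src_y1[i] - src_y1[i + 2];
    int c = src_y2[i] - src_y2[i + 2];
    int sobel = std::abs(a + b + b + c);
    dst_sobelx[i] = (uint8_t)(sobel > 255 ? 255 : sobel);  // saturate like packuswb
  }
}
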
@@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_y0
mov esi, [esp + 4 + 8] // src_y1
mov edx, [esp + 4 + 12] // dst_sobely
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
pxor xmm5, xmm5 // constant 0
convertloop:
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
punpcklbw xmm0, xmm5
punpcklbw xmm1, xmm5
psubw xmm0, xmm1
movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
punpcklbw xmm1, xmm5
punpcklbw xmm2, xmm5
psubw xmm1, xmm2
movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
punpcklbw xmm2, xmm5
punpcklbw xmm3, xmm5
@@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
pslld xmm5, 24 // 0xff000000
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
paddusb xmm0, xmm1 // sobel = sobelx + sobely
@@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
@@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] // src_sobelx
mov esi, [esp + 4 + 8] // src_sobely
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
pcmpeqb xmm5, xmm5 // alpha 255
convertloop:
movdqu xmm0, [eax] // read 16 pixels src_sobelx
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
lea eax, [eax + 16]
movdqa xmm2, xmm0
@@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movdqu xmm0, [eax]
psubd xmm0, [eax + edx * 4]
@@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
test edx, 15
jne l4b
// 4 pixel loop
l4:
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
lea eax, [eax + 16]
@@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
add ecx, 4 - 1
jl l1b
// 1 pixel loop
l1:
movd xmm2, dword ptr [eax] // 1 argb pixel
lea eax, [eax + 4]
@@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
mov esi, [esp + 16] // stride
mov edx, [esp + 20] // dst_argb
mov ecx, [esp + 24] // pointer to uv_dudv
movq xmm2, qword ptr [ecx] // uv
movq xmm7, qword ptr [ecx + 8] // dudv
mov ecx, [esp + 28] // width
shl esi, 16 // 4, stride
@@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
sub ecx, 4
jl l4b
// setup for 4 pixel loop
pshufd xmm7, xmm7, 0x44 // dup dudv
pshufd xmm5, xmm5, 0 // dup 4, stride
movdqa xmm0, xmm2 // x0, y0, x1, y1
@@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
addps xmm3, xmm4
addps xmm4, xmm4 // dudv *= 4
// 4 pixel loop
l4:
cvttps2dq xmm0, xmm2 // x, y float to int first 2
cvttps2dq xmm1, xmm3 // x, y float to int next 2
packssdw xmm0, xmm1 // x, y as 8 shorts
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
movd esi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd edi, xmm0
pshufd xmm0, xmm0, 0x39 // shift right
movd xmm1, [eax + esi] // read pixel 0
movd xmm6, [eax + edi] // read pixel 1
punpckldq xmm1, xmm6 // combine pixel 0 and 1
@@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
__asm {
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
@@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
je xloop100 // 0 / 256. Blend 100 / 0.
sub edi, esi
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
vmovd xmm0, eax // high fraction 0..255
neg eax
@@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
vpaddw ymm0, ymm0, ymm4
vpsrlw ymm1, ymm1, 8
vpsrlw ymm0, ymm0, 8
vpackuswb ymm0, ymm0, ymm1 // unmutates
vmovdqu [esi + edi], ymm0
lea esi, [esi + 32]
sub ecx, 32
@@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
push esi
push edi
mov edi, [esp + 8 + 4] // dst_ptr
mov esi, [esp + 8 + 8] // src_ptr
mov edx, [esp + 8 + 12] // src_stride
mov ecx, [esp + 8 + 16] // dst_width
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
sub edi, esi
// Dispatch to specialized filters if applicable.
cmp eax, 0
je xloop100 // 0 /256. Blend 100 / 0.
cmp eax, 128
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
movd xmm0, eax // high fraction 0..255
neg eax
@@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
movdqu xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
psubb xmm0, xmm4 // bias image by -128
psubb xmm1, xmm4
movdqa xmm2, xmm5
movdqa xmm3, xmm5
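
The InterpolateRow hunks only realign comments; per those comments (fraction 0 copies row 0 via xloop100, 128 blends 50/50 via xloop50), the blend is roughly this scalar sketch (an editor's assumption-level sketch, not the library's exact C path):

#include <cstdint>

void InterpolateRowSketch(uint8_t* dst, const uint8_t* src0,
                          const uint8_t* src1, int width, int fraction) {
  int f1 = fraction;  // weight of the second row, 0..255
  int f0 = 256 - f1;  // weight of the first row
  for (int x = 0; x < width; ++x) {
    // +128 rounds before the >> 8, matching the paddw/psrlw pair above.
    dst[x] = (uint8_t)((src0[x] * f0 + src1[x] * f1 + 128) >> 8);
  }
}
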
@@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
movdqu xmm5, [ecx]
mov ecx, [esp + 16] // width
@@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
const uint8_t* shuffler,
int width) {
__asm {
mov eax, [esp + 4] // src_argb
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // shuffler
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
mov ecx, [esp + 16] // width
@@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
lea eax, [eax + 16]
movdqa xmm1, xmm0
@@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] // src_y
mov esi, [esp + 8 + 8] // src_u
mov edx, [esp + 8 + 12] // src_v
mov edi, [esp + 8 + 16] // dst_frame
mov ecx, [esp + 8 + 20] // width
sub edx, esi
convertloop:
movq xmm2, qword ptr [esi] // U
movq xmm3, qword ptr [esi + edx] // V
lea esi, [esi + 8]
punpcklbw xmm2, xmm3 // UV
movdqu xmm0, [eax] // Y
movdqa xmm1, xmm2
lea eax, [eax + 16]
punpcklbw xmm1, xmm0 // UYVY
punpckhbw xmm2, xmm0
movdqu [edi], xmm1
movdqu [edi + 16], xmm2
@@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
mov ecx, [esp + 4 + 16] /* width */
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
// 2 pixel loop.
convertloop:
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
movq xmm0, qword ptr [eax] // BGRABGRA
lea eax, [eax + 8]
punpcklbw xmm0, xmm3
@@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
const float* poly,
int width) {
__asm {
mov eax, [esp + 4] /* src_argb */
mov edx, [esp + 8] /* dst_argb */
mov ecx, [esp + 12] /* poly */
vbroadcastf128 ymm4, [ecx] // C0
vbroadcastf128 ymm5, [ecx + 16] // C1
@@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
movd xmm4, dword ptr [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
mulss xmm4, kExpBias
@@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
pxor xmm5, xmm5
sub edx, eax
// 8 pixel loop.
convertloop:
movdqu xmm2, xmmword ptr [eax] // 8 shorts
add eax, 16
@@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vpxor ymm5, ymm5, ymm5
sub edx, eax
// 16 pixel loop.
convertloop:
vmovdqu ymm2, [eax] // 16 shorts
add eax, 32
@@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
vcvtdq2ps ymm2, ymm2
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
vmulps ymm2, ymm2, ymm4
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
vpsrld ymm2, ymm2, 13
vpackssdw ymm2, ymm2, ymm3
vmovdqu [eax + edx - 32], ymm2
@@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
float scale,
int width) {
__asm {
mov eax, [esp + 4] /* src */
mov edx, [esp + 8] /* dst */
vbroadcastss ymm4, [esp + 12] /* scale */
mov ecx, [esp + 16] /* width */
sub edx, eax
// 16 pixel loop.
convertloop:
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
add eax, 32
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
vcvtdq2ps ymm3, ymm3
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
vmulps ymm3, ymm3, ymm4
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
vcvtps2ph xmm3, ymm3, 3
vmovdqu [eax + edx + 32], xmm2
vmovdqu [eax + edx + 32 + 16], xmm3
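
Per element, the F16C loop above amounts to roughly one intrinsic (an editor's sketch, not from the commit): normalize the 16-bit sample by scale, then convert float to IEEE half. _cvtss_sh is the scalar F16C intrinsic behind vcvtps2ph, and the rounding immediate 3 truncates, matching the comment.

#include <cstdint>
#include <immintrin.h>

void HalfFloatSketch(const uint16_t* src, uint16_t* dst, float scale,
                     int width) {
  for (int i = 0; i < width; ++i) {
    dst[i] = _cvtss_sh((float)src[i] * scale, 3);  // 3 = round toward zero
  }
}
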
@@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
int width) {
__asm {
push esi
mov eax, [esp + 4 + 4] /* dst_argb */
mov esi, [esp + 4 + 8] /* table_argb */
mov ecx, [esp + 4 + 12] /* width */
// 1 pixel loop.
@@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
__asm {
push esi
push edi
mov eax, [esp + 8 + 4] /* src_argb */
mov edi, [esp + 8 + 8] /* dst_argb */
mov ecx, [esp + 8 + 12] /* width */
movd xmm2, dword ptr [esp + 8 + 16] // luma table
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
@@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
psllw xmm4, 8
pxor xmm5, xmm5
// 4 pixel loop.
convertloop:
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
pmaddubsw xmm0, xmm3