mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Fix ConvertToI420 when using YUY2 or UYVY with odd crop_x.
- swap U and V when crop_x is odd
- document YUY2 and UYVY formats
- apply clang-format

Bug: libyuv:902
Change-Id: I045e44c907f4a9eb625d7c024b669bb308055f32
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3039549
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
This commit is contained in:
parent
0572e0a0b1
commit
639dd4ea76
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 1788
|
Version: 1789
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -189,7 +189,6 @@ In memory R is the lowest and A is the highest.
|
|||||||
Each channel's value ranges from 0 to 65535.
|
Each channel's value ranges from 0 to 65535.
|
||||||
AR64 is similar to ARGB.
|
AR64 is similar to ARGB.
|
||||||
|
|
||||||
|
|
||||||
# NV12 and NV21
|
# NV12 and NV21
|
||||||
|
|
||||||
NV12 is a biplanar format with a full sized Y plane followed by a single
|
NV12 is a biplanar format with a full sized Y plane followed by a single
|
||||||
@ -200,3 +199,10 @@ height chroma channel, and therefore is a 420 subsampling.
|
|||||||
NV16 is 16 bits per pixel, with half width and full height. aka 422.
|
NV16 is 16 bits per pixel, with half width and full height. aka 422.
|
||||||
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
|
NV24 is 24 bits per pixel with full sized chroma channel. aka 444.
|
||||||
Most NV12 functions allow the destination Y pointer to be NULL.
|
Most NV12 functions allow the destination Y pointer to be NULL.
|
||||||
|
|
||||||
|
# YUY2 and UYVY
|
||||||
|
|
||||||
|
YUY2 is a packed YUV format with half width, full height chroma. aka 422.
|
||||||
|
|
||||||
|
YUY2 is YUYV in memory.
|
||||||
|
UYVY is UYVY in memory.
|
||||||
|
|||||||
@ -11,6 +11,6 @@
|
|||||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||||
#define INCLUDE_LIBYUV_VERSION_H_
|
#define INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|
||||||
#define LIBYUV_VERSION 1788
|
#define LIBYUV_VERSION 1789
|
||||||
|
|
||||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||||
|
|||||||
@ -89,18 +89,26 @@ int ConvertToI420(const uint8_t* sample,
|
|||||||
|
|
||||||
switch (format) {
|
switch (format) {
|
||||||
// Single plane formats
|
// Single plane formats
|
||||||
case FOURCC_YUY2:
|
case FOURCC_YUY2: { // TODO(fbarchard): Find better odd crop fix.
|
||||||
|
uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
|
||||||
|
uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
|
||||||
|
int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
|
||||||
|
int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
|
||||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||||
r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
|
r = YUY2ToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
|
||||||
dst_stride_u, dst_v, dst_stride_v, crop_width,
|
stride_u, v, stride_v, crop_width, inv_crop_height);
|
||||||
inv_crop_height);
|
|
||||||
break;
|
break;
|
||||||
case FOURCC_UYVY:
|
}
|
||||||
|
case FOURCC_UYVY: {
|
||||||
|
uint8_t* u = (crop_x & 1) ? dst_v : dst_u;
|
||||||
|
uint8_t* v = (crop_x & 1) ? dst_u : dst_v;
|
||||||
|
int stride_u = (crop_x & 1) ? dst_stride_v : dst_stride_u;
|
||||||
|
int stride_v = (crop_x & 1) ? dst_stride_u : dst_stride_v;
|
||||||
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
src = sample + (aligned_src_width * crop_y + crop_x) * 2;
|
||||||
r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, dst_u,
|
r = UYVYToI420(src, aligned_src_width * 2, dst_y, dst_stride_y, u,
|
||||||
dst_stride_u, dst_v, dst_stride_v, crop_width,
|
stride_u, v, stride_v, crop_width, inv_crop_height);
|
||||||
inv_crop_height);
|
|
||||||
break;
|
break;
|
||||||
|
}
|
||||||
case FOURCC_RGBP:
|
case FOURCC_RGBP:
|
||||||
src = sample + (src_width * crop_y + crop_x) * 2;
|
src = sample + (src_width * crop_y + crop_x) * 2;
|
||||||
r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
|
r = RGB565ToI420(src, src_width * 2, dst_y, dst_stride_y, dst_u,
|
||||||
|
|||||||
@ -2584,7 +2584,7 @@ __declspec(naked) void I422ToRGBARow_AVX2(
|
|||||||
__asm psraw xmm2, 6 \
|
__asm psraw xmm2, 6 \
|
||||||
__asm packuswb xmm0, xmm0 /* B */ \
|
__asm packuswb xmm0, xmm0 /* B */ \
|
||||||
__asm packuswb xmm1, xmm1 /* G */ \
|
__asm packuswb xmm1, xmm1 /* G */ \
|
||||||
__asm packuswb xmm2, xmm2 /* R */ \
|
__asm packuswb xmm2, xmm2 /* R */ \
|
||||||
}
|
}
|
||||||
|
|
||||||
// Store 8 ARGB values.
|
// Store 8 ARGB values.
|
||||||
@ -4746,22 +4746,22 @@ __declspec(naked) void ARGBSepiaRow_SSSE3(uint8_t* dst_argb, int width) {
|
|||||||
pmaddubsw xmm6, xmm2
|
pmaddubsw xmm6, xmm2
|
||||||
phaddw xmm0, xmm6
|
phaddw xmm0, xmm6
|
||||||
psrlw xmm0, 7
|
psrlw xmm0, 7
|
||||||
packuswb xmm0, xmm0 // 8 B values
|
packuswb xmm0, xmm0 // 8 B values
|
||||||
movdqu xmm5, [eax] // G
|
movdqu xmm5, [eax] // G
|
||||||
movdqu xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
pmaddubsw xmm5, xmm3
|
pmaddubsw xmm5, xmm3
|
||||||
pmaddubsw xmm1, xmm3
|
pmaddubsw xmm1, xmm3
|
||||||
phaddw xmm5, xmm1
|
phaddw xmm5, xmm1
|
||||||
psrlw xmm5, 7
|
psrlw xmm5, 7
|
||||||
packuswb xmm5, xmm5 // 8 G values
|
packuswb xmm5, xmm5 // 8 G values
|
||||||
punpcklbw xmm0, xmm5 // 8 BG values
|
punpcklbw xmm0, xmm5 // 8 BG values
|
||||||
movdqu xmm5, [eax] // R
|
movdqu xmm5, [eax] // R
|
||||||
movdqu xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
pmaddubsw xmm5, xmm4
|
pmaddubsw xmm5, xmm4
|
||||||
pmaddubsw xmm1, xmm4
|
pmaddubsw xmm1, xmm4
|
||||||
phaddw xmm5, xmm1
|
phaddw xmm5, xmm1
|
||||||
psrlw xmm5, 7
|
psrlw xmm5, 7
|
||||||
packuswb xmm5, xmm5 // 8 R values
|
packuswb xmm5, xmm5 // 8 R values
|
||||||
movdqu xmm6, [eax] // A
|
movdqu xmm6, [eax] // A
|
||||||
movdqu xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
psrld xmm6, 24
|
psrld xmm6, 24
|
||||||
@ -4811,25 +4811,25 @@ __declspec(naked) void ARGBColorMatrixRow_SSSE3(const uint8_t* src_argb,
|
|||||||
movdqu xmm1, [eax + 16]
|
movdqu xmm1, [eax + 16]
|
||||||
pmaddubsw xmm6, xmm3
|
pmaddubsw xmm6, xmm3
|
||||||
pmaddubsw xmm1, xmm3
|
pmaddubsw xmm1, xmm3
|
||||||
phaddsw xmm0, xmm7 // B
|
phaddsw xmm0, xmm7 // B
|
||||||
phaddsw xmm6, xmm1 // G
|
phaddsw xmm6, xmm1 // G
|
||||||
psraw xmm0, 6 // B
|
psraw xmm0, 6 // B
|
||||||
psraw xmm6, 6 // G
|
psraw xmm6, 6 // G
|
||||||
packuswb xmm0, xmm0 // 8 B values
|
packuswb xmm0, xmm0 // 8 B values
|
||||||
packuswb xmm6, xmm6 // 8 G values
|
packuswb xmm6, xmm6 // 8 G values
|
||||||
punpcklbw xmm0, xmm6 // 8 BG values
|
punpcklbw xmm0, xmm6 // 8 BG values
|
||||||
movdqu xmm1, [eax] // R
|
movdqu xmm1, [eax] // R
|
||||||
movdqu xmm7, [eax + 16]
|
movdqu xmm7, [eax + 16]
|
||||||
pmaddubsw xmm1, xmm4
|
pmaddubsw xmm1, xmm4
|
||||||
pmaddubsw xmm7, xmm4
|
pmaddubsw xmm7, xmm4
|
||||||
phaddsw xmm1, xmm7 // R
|
phaddsw xmm1, xmm7 // R
|
||||||
movdqu xmm6, [eax] // A
|
movdqu xmm6, [eax] // A
|
||||||
movdqu xmm7, [eax + 16]
|
movdqu xmm7, [eax + 16]
|
||||||
pmaddubsw xmm6, xmm5
|
pmaddubsw xmm6, xmm5
|
||||||
pmaddubsw xmm7, xmm5
|
pmaddubsw xmm7, xmm5
|
||||||
phaddsw xmm6, xmm7 // A
|
phaddsw xmm6, xmm7 // A
|
||||||
psraw xmm1, 6 // R
|
psraw xmm1, 6 // R
|
||||||
psraw xmm6, 6 // A
|
psraw xmm6, 6 // A
|
||||||
packuswb xmm1, xmm1 // 8 R values
|
packuswb xmm1, xmm1 // 8 R values
|
||||||
packuswb xmm6, xmm6 // 8 A values
|
packuswb xmm6, xmm6 // 8 A values
|
||||||
punpcklbw xmm1, xmm6 // 8 RA values
|
punpcklbw xmm1, xmm6 // 8 RA values
|
||||||
@ -4872,16 +4872,16 @@ __declspec(naked) void ARGBQuantizeRow_SSE2(uint8_t* dst_argb,
|
|||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm0, [eax] // read 4 pixels
|
movdqu xmm0, [eax] // read 4 pixels
|
||||||
punpcklbw xmm0, xmm5 // first 2 pixels
|
punpcklbw xmm0, xmm5 // first 2 pixels
|
||||||
pmulhuw xmm0, xmm2 // pixel * scale >> 16
|
pmulhuw xmm0, xmm2 // pixel * scale >> 16
|
||||||
movdqu xmm1, [eax] // read 4 pixels
|
movdqu xmm1, [eax] // read 4 pixels
|
||||||
punpckhbw xmm1, xmm5 // next 2 pixels
|
punpckhbw xmm1, xmm5 // next 2 pixels
|
||||||
pmulhuw xmm1, xmm2
|
pmulhuw xmm1, xmm2
|
||||||
pmullw xmm0, xmm3 // * interval_size
|
pmullw xmm0, xmm3 // * interval_size
|
||||||
movdqu xmm7, [eax] // read 4 pixels
|
movdqu xmm7, [eax] // read 4 pixels
|
||||||
pmullw xmm1, xmm3
|
pmullw xmm1, xmm3
|
||||||
pand xmm7, xmm6 // mask alpha
|
pand xmm7, xmm6 // mask alpha
|
||||||
paddw xmm0, xmm4 // + interval_size / 2
|
paddw xmm0, xmm4 // + interval_size / 2
|
||||||
paddw xmm1, xmm4
|
paddw xmm1, xmm4
|
||||||
packuswb xmm0, xmm1
|
packuswb xmm0, xmm1
|
||||||
por xmm0, xmm7
|
por xmm0, xmm7
|
||||||
@ -4901,9 +4901,9 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
|
|||||||
int width,
|
int width,
|
||||||
uint32_t value) {
|
uint32_t value) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src_argb
|
mov eax, [esp + 4] // src_argb
|
||||||
mov edx, [esp + 8] // dst_argb
|
mov edx, [esp + 8] // dst_argb
|
||||||
mov ecx, [esp + 12] // width
|
mov ecx, [esp + 12] // width
|
||||||
movd xmm2, [esp + 16] // value
|
movd xmm2, [esp + 16] // value
|
||||||
punpcklbw xmm2, xmm2
|
punpcklbw xmm2, xmm2
|
||||||
punpcklqdq xmm2, xmm2
|
punpcklqdq xmm2, xmm2
|
||||||
@ -4912,10 +4912,10 @@ __declspec(naked) void ARGBShadeRow_SSE2(const uint8_t* src_argb,
|
|||||||
movdqu xmm0, [eax] // read 4 pixels
|
movdqu xmm0, [eax] // read 4 pixels
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
punpcklbw xmm0, xmm0 // first 2
|
punpcklbw xmm0, xmm0 // first 2
|
||||||
punpckhbw xmm1, xmm1 // next 2
|
punpckhbw xmm1, xmm1 // next 2
|
||||||
pmulhuw xmm0, xmm2 // argb * value
|
pmulhuw xmm0, xmm2 // argb * value
|
||||||
pmulhuw xmm1, xmm2 // argb * value
|
pmulhuw xmm1, xmm2 // argb * value
|
||||||
psrlw xmm0, 8
|
psrlw xmm0, 8
|
||||||
psrlw xmm1, 8
|
psrlw xmm1, 8
|
||||||
packuswb xmm0, xmm1
|
packuswb xmm0, xmm1
|
||||||
@ -4937,23 +4937,23 @@ __declspec(naked) void ARGBMultiplyRow_SSE2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
pxor xmm5, xmm5 // constant 0
|
pxor xmm5, xmm5 // constant 0
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm0, [eax] // read 4 pixels from src_argb
|
movdqu xmm0, [eax] // read 4 pixels from src_argb
|
||||||
movdqu xmm2, [esi] // read 4 pixels from src_argb1
|
movdqu xmm2, [esi] // read 4 pixels from src_argb1
|
||||||
movdqu xmm1, xmm0
|
movdqu xmm1, xmm0
|
||||||
movdqu xmm3, xmm2
|
movdqu xmm3, xmm2
|
||||||
punpcklbw xmm0, xmm0 // first 2
|
punpcklbw xmm0, xmm0 // first 2
|
||||||
punpckhbw xmm1, xmm1 // next 2
|
punpckhbw xmm1, xmm1 // next 2
|
||||||
punpcklbw xmm2, xmm5 // first 2
|
punpcklbw xmm2, xmm5 // first 2
|
||||||
punpckhbw xmm3, xmm5 // next 2
|
punpckhbw xmm3, xmm5 // next 2
|
||||||
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
|
pmulhuw xmm0, xmm2 // src_argb * src_argb1 first 2
|
||||||
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
|
pmulhuw xmm1, xmm3 // src_argb * src_argb1 next 2
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
lea esi, [esi + 16]
|
lea esi, [esi + 16]
|
||||||
packuswb xmm0, xmm1
|
packuswb xmm0, xmm1
|
||||||
@ -4977,8 +4977,8 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
|
|
||||||
@ -4990,7 +4990,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
|
|||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
||||||
lea esi, [esi + 16]
|
lea esi, [esi + 16]
|
||||||
paddusb xmm0, xmm1 // src_argb + src_argb1
|
paddusb xmm0, xmm1 // src_argb + src_argb1
|
||||||
movdqu [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
lea edx, [edx + 16]
|
lea edx, [edx + 16]
|
||||||
sub ecx, 4
|
sub ecx, 4
|
||||||
@ -5005,7 +5005,7 @@ __declspec(naked) void ARGBAddRow_SSE2(const uint8_t* src_argb,
|
|||||||
lea eax, [eax + 4]
|
lea eax, [eax + 4]
|
||||||
movd xmm1, [esi] // read 1 pixels from src_argb1
|
movd xmm1, [esi] // read 1 pixels from src_argb1
|
||||||
lea esi, [esi + 4]
|
lea esi, [esi + 4]
|
||||||
paddusb xmm0, xmm1 // src_argb + src_argb1
|
paddusb xmm0, xmm1 // src_argb + src_argb1
|
||||||
movd [edx], xmm0
|
movd [edx], xmm0
|
||||||
lea edx, [edx + 4]
|
lea edx, [edx + 4]
|
||||||
sub ecx, 1
|
sub ecx, 1
|
||||||
@ -5026,8 +5026,8 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
|
|
||||||
@ -5036,7 +5036,7 @@ __declspec(naked) void ARGBSubtractRow_SSE2(const uint8_t* src_argb,
|
|||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
movdqu xmm1, [esi] // read 4 pixels from src_argb1
|
||||||
lea esi, [esi + 16]
|
lea esi, [esi + 16]
|
||||||
psubusb xmm0, xmm1 // src_argb - src_argb1
|
psubusb xmm0, xmm1 // src_argb - src_argb1
|
||||||
movdqu [edx], xmm0
|
movdqu [edx], xmm0
|
||||||
lea edx, [edx + 16]
|
lea edx, [edx + 16]
|
||||||
sub ecx, 4
|
sub ecx, 4
|
||||||
@ -5056,11 +5056,11 @@ __declspec(naked) void ARGBMultiplyRow_AVX2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
vpxor ymm5, ymm5, ymm5 // constant 0
|
vpxor ymm5, ymm5, ymm5 // constant 0
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
|
vmovdqu ymm1, [eax] // read 8 pixels from src_argb
|
||||||
@ -5094,8 +5094,8 @@ __declspec(naked) void ARGBAddRow_AVX2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
|
|
||||||
@ -5124,8 +5124,8 @@ __declspec(naked) void ARGBSubtractRow_AVX2(const uint8_t* src_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_argb
|
mov eax, [esp + 4 + 4] // src_argb
|
||||||
mov esi, [esp + 4 + 8] // src_argb1
|
mov esi, [esp + 4 + 8] // src_argb1
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
|
|
||||||
@ -5159,8 +5159,8 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
|
|||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
mov eax, [esp + 8 + 4] // src_y0
|
mov eax, [esp + 8 + 4] // src_y0
|
||||||
mov esi, [esp + 8 + 8] // src_y1
|
mov esi, [esp + 8 + 8] // src_y1
|
||||||
mov edi, [esp + 8 + 12] // src_y2
|
mov edi, [esp + 8 + 12] // src_y2
|
||||||
mov edx, [esp + 8 + 16] // dst_sobelx
|
mov edx, [esp + 8 + 16] // dst_sobelx
|
||||||
mov ecx, [esp + 8 + 20] // width
|
mov ecx, [esp + 8 + 20] // width
|
||||||
@ -5170,17 +5170,17 @@ __declspec(naked) void SobelXRow_SSE2(const uint8_t* src_y0,
|
|||||||
pxor xmm5, xmm5 // constant 0
|
pxor xmm5, xmm5 // constant 0
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
||||||
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
|
movq xmm1, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
|
||||||
punpcklbw xmm0, xmm5
|
punpcklbw xmm0, xmm5
|
||||||
punpcklbw xmm1, xmm5
|
punpcklbw xmm1, xmm5
|
||||||
psubw xmm0, xmm1
|
psubw xmm0, xmm1
|
||||||
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
|
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
|
||||||
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
|
movq xmm2, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
|
||||||
punpcklbw xmm1, xmm5
|
punpcklbw xmm1, xmm5
|
||||||
punpcklbw xmm2, xmm5
|
punpcklbw xmm2, xmm5
|
||||||
psubw xmm1, xmm2
|
psubw xmm1, xmm2
|
||||||
movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
|
movq xmm2, qword ptr [eax + edi] // read 8 pixels from src_y2[0]
|
||||||
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
|
movq xmm3, qword ptr [eax + edi + 2] // read 8 pixels from src_y2[2]
|
||||||
punpcklbw xmm2, xmm5
|
punpcklbw xmm2, xmm5
|
||||||
punpcklbw xmm3, xmm5
|
punpcklbw xmm3, xmm5
|
||||||
@ -5215,8 +5215,8 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_y0
|
mov eax, [esp + 4 + 4] // src_y0
|
||||||
mov esi, [esp + 4 + 8] // src_y1
|
mov esi, [esp + 4 + 8] // src_y1
|
||||||
mov edx, [esp + 4 + 12] // dst_sobely
|
mov edx, [esp + 4 + 12] // dst_sobely
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
sub esi, eax
|
sub esi, eax
|
||||||
@ -5224,17 +5224,17 @@ __declspec(naked) void SobelYRow_SSE2(const uint8_t* src_y0,
|
|||||||
pxor xmm5, xmm5 // constant 0
|
pxor xmm5, xmm5 // constant 0
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
movq xmm0, qword ptr [eax] // read 8 pixels from src_y0[0]
|
||||||
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
|
movq xmm1, qword ptr [eax + esi] // read 8 pixels from src_y1[0]
|
||||||
punpcklbw xmm0, xmm5
|
punpcklbw xmm0, xmm5
|
||||||
punpcklbw xmm1, xmm5
|
punpcklbw xmm1, xmm5
|
||||||
psubw xmm0, xmm1
|
psubw xmm0, xmm1
|
||||||
movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
|
movq xmm1, qword ptr [eax + 1] // read 8 pixels from src_y0[1]
|
||||||
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
|
movq xmm2, qword ptr [eax + esi + 1] // read 8 pixels from src_y1[1]
|
||||||
punpcklbw xmm1, xmm5
|
punpcklbw xmm1, xmm5
|
||||||
punpcklbw xmm2, xmm5
|
punpcklbw xmm2, xmm5
|
||||||
psubw xmm1, xmm2
|
psubw xmm1, xmm2
|
||||||
movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
|
movq xmm2, qword ptr [eax + 2] // read 8 pixels from src_y0[2]
|
||||||
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
|
movq xmm3, qword ptr [eax + esi + 2] // read 8 pixels from src_y1[2]
|
||||||
punpcklbw xmm2, xmm5
|
punpcklbw xmm2, xmm5
|
||||||
punpcklbw xmm3, xmm5
|
punpcklbw xmm3, xmm5
|
||||||
@ -5269,8 +5269,8 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_sobelx
|
mov eax, [esp + 4 + 4] // src_sobelx
|
||||||
mov esi, [esp + 4 + 8] // src_sobely
|
mov esi, [esp + 4 + 8] // src_sobely
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
sub esi, eax
|
sub esi, eax
|
||||||
@ -5278,7 +5278,7 @@ __declspec(naked) void SobelRow_SSE2(const uint8_t* src_sobelx,
|
|||||||
pslld xmm5, 24 // 0xff000000
|
pslld xmm5, 24 // 0xff000000
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
||||||
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
paddusb xmm0, xmm1 // sobel = sobelx + sobely
|
paddusb xmm0, xmm1 // sobel = sobelx + sobely
|
||||||
@ -5317,8 +5317,8 @@ __declspec(naked) void SobelToPlaneRow_SSE2(const uint8_t* src_sobelx,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_sobelx
|
mov eax, [esp + 4 + 4] // src_sobelx
|
||||||
mov esi, [esp + 4 + 8] // src_sobely
|
mov esi, [esp + 4 + 8] // src_sobely
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
sub esi, eax
|
sub esi, eax
|
||||||
@ -5351,15 +5351,15 @@ __declspec(naked) void SobelXYRow_SSE2(const uint8_t* src_sobelx,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] // src_sobelx
|
mov eax, [esp + 4 + 4] // src_sobelx
|
||||||
mov esi, [esp + 4 + 8] // src_sobely
|
mov esi, [esp + 4 + 8] // src_sobely
|
||||||
mov edx, [esp + 4 + 12] // dst_argb
|
mov edx, [esp + 4 + 12] // dst_argb
|
||||||
mov ecx, [esp + 4 + 16] // width
|
mov ecx, [esp + 4 + 16] // width
|
||||||
sub esi, eax
|
sub esi, eax
|
||||||
pcmpeqb xmm5, xmm5 // alpha 255
|
pcmpeqb xmm5, xmm5 // alpha 255
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
movdqu xmm0, [eax] // read 16 pixels src_sobelx
|
||||||
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
movdqu xmm1, [eax + esi] // read 16 pixels src_sobely
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
movdqa xmm2, xmm0
|
movdqa xmm2, xmm0
|
||||||
@ -5529,7 +5529,7 @@ void CumulativeSumToAverageRow_SSE2(const int32_t* topleft,
|
|||||||
add ecx, 4 - 1
|
add ecx, 4 - 1
|
||||||
jl l1b
|
jl l1b
|
||||||
|
|
||||||
// 1 pixel loop
|
// 1 pixel loop
|
||||||
l1:
|
l1:
|
||||||
movdqu xmm0, [eax]
|
movdqu xmm0, [eax]
|
||||||
psubd xmm0, [eax + edx * 4]
|
psubd xmm0, [eax + edx * 4]
|
||||||
@ -5571,7 +5571,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
|
|||||||
test edx, 15
|
test edx, 15
|
||||||
jne l4b
|
jne l4b
|
||||||
|
|
||||||
// 4 pixel loop
|
// 4 pixel loop
|
||||||
l4:
|
l4:
|
||||||
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
|
movdqu xmm2, [eax] // 4 argb pixels 16 bytes.
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
@ -5617,7 +5617,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
|
|||||||
add ecx, 4 - 1
|
add ecx, 4 - 1
|
||||||
jl l1b
|
jl l1b
|
||||||
|
|
||||||
// 1 pixel loop
|
// 1 pixel loop
|
||||||
l1:
|
l1:
|
||||||
movd xmm2, dword ptr [eax] // 1 argb pixel
|
movd xmm2, dword ptr [eax] // 1 argb pixel
|
||||||
lea eax, [eax + 4]
|
lea eax, [eax + 4]
|
||||||
@ -5651,7 +5651,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
|
|||||||
mov esi, [esp + 16] // stride
|
mov esi, [esp + 16] // stride
|
||||||
mov edx, [esp + 20] // dst_argb
|
mov edx, [esp + 20] // dst_argb
|
||||||
mov ecx, [esp + 24] // pointer to uv_dudv
|
mov ecx, [esp + 24] // pointer to uv_dudv
|
||||||
movq xmm2, qword ptr [ecx] // uv
|
movq xmm2, qword ptr [ecx] // uv
|
||||||
movq xmm7, qword ptr [ecx + 8] // dudv
|
movq xmm7, qword ptr [ecx + 8] // dudv
|
||||||
mov ecx, [esp + 28] // width
|
mov ecx, [esp + 28] // width
|
||||||
shl esi, 16 // 4, stride
|
shl esi, 16 // 4, stride
|
||||||
@ -5660,7 +5660,7 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
|
|||||||
sub ecx, 4
|
sub ecx, 4
|
||||||
jl l4b
|
jl l4b
|
||||||
|
|
||||||
// setup for 4 pixel loop
|
// setup for 4 pixel loop
|
||||||
pshufd xmm7, xmm7, 0x44 // dup dudv
|
pshufd xmm7, xmm7, 0x44 // dup dudv
|
||||||
pshufd xmm5, xmm5, 0 // dup 4, stride
|
pshufd xmm5, xmm5, 0 // dup 4, stride
|
||||||
movdqa xmm0, xmm2 // x0, y0, x1, y1
|
movdqa xmm0, xmm2 // x0, y0, x1, y1
|
||||||
@ -5672,16 +5672,16 @@ __declspec(naked) LIBYUV_API void ARGBAffineRow_SSE2(const uint8_t* src_argb,
|
|||||||
addps xmm3, xmm4
|
addps xmm3, xmm4
|
||||||
addps xmm4, xmm4 // dudv *= 4
|
addps xmm4, xmm4 // dudv *= 4
|
||||||
|
|
||||||
// 4 pixel loop
|
// 4 pixel loop
|
||||||
l4:
|
l4:
|
||||||
cvttps2dq xmm0, xmm2 // x, y float to int first 2
|
cvttps2dq xmm0, xmm2 // x, y float to int first 2
|
||||||
cvttps2dq xmm1, xmm3 // x, y float to int next 2
|
cvttps2dq xmm1, xmm3 // x, y float to int next 2
|
||||||
packssdw xmm0, xmm1 // x, y as 8 shorts
|
packssdw xmm0, xmm1 // x, y as 8 shorts
|
||||||
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
|
pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride.
|
||||||
movd esi, xmm0
|
movd esi, xmm0
|
||||||
pshufd xmm0, xmm0, 0x39 // shift right
|
pshufd xmm0, xmm0, 0x39 // shift right
|
||||||
movd edi, xmm0
|
movd edi, xmm0
|
||||||
pshufd xmm0, xmm0, 0x39 // shift right
|
pshufd xmm0, xmm0, 0x39 // shift right
|
||||||
movd xmm1, [eax + esi] // read pixel 0
|
movd xmm1, [eax + esi] // read pixel 0
|
||||||
movd xmm6, [eax + edi] // read pixel 1
|
movd xmm6, [eax + edi] // read pixel 1
|
||||||
punpckldq xmm1, xmm6 // combine pixel 0 and 1
|
punpckldq xmm1, xmm6 // combine pixel 0 and 1
|
||||||
@ -5733,8 +5733,8 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
|
|||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
mov edi, [esp + 8 + 4] // dst_ptr
|
mov edi, [esp + 8 + 4] // dst_ptr
|
||||||
mov esi, [esp + 8 + 8] // src_ptr
|
mov esi, [esp + 8 + 8] // src_ptr
|
||||||
mov edx, [esp + 8 + 12] // src_stride
|
mov edx, [esp + 8 + 12] // src_stride
|
||||||
mov ecx, [esp + 8 + 16] // dst_width
|
mov ecx, [esp + 8 + 16] // dst_width
|
||||||
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
||||||
@ -5743,7 +5743,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
|
|||||||
je xloop100 // 0 / 256. Blend 100 / 0.
|
je xloop100 // 0 / 256. Blend 100 / 0.
|
||||||
sub edi, esi
|
sub edi, esi
|
||||||
cmp eax, 128
|
cmp eax, 128
|
||||||
je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
|
je xloop50 // 128 /256 is 0.50. Blend 50 / 50.
|
||||||
|
|
||||||
vmovd xmm0, eax // high fraction 0..255
|
vmovd xmm0, eax // high fraction 0..255
|
||||||
neg eax
|
neg eax
|
||||||
@ -5770,7 +5770,7 @@ __declspec(naked) void InterpolateRow_AVX2(uint8_t* dst_ptr,
|
|||||||
vpaddw ymm0, ymm0, ymm4
|
vpaddw ymm0, ymm0, ymm4
|
||||||
vpsrlw ymm1, ymm1, 8
|
vpsrlw ymm1, ymm1, 8
|
||||||
vpsrlw ymm0, ymm0, 8
|
vpsrlw ymm0, ymm0, 8
|
||||||
vpackuswb ymm0, ymm0, ymm1 // unmutates
|
vpackuswb ymm0, ymm0, ymm1 // unmutates
|
||||||
vmovdqu [esi + edi], ymm0
|
vmovdqu [esi + edi], ymm0
|
||||||
lea esi, [esi + 32]
|
lea esi, [esi + 32]
|
||||||
sub ecx, 32
|
sub ecx, 32
|
||||||
@ -5811,17 +5811,17 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
|
|||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
|
|
||||||
mov edi, [esp + 8 + 4] // dst_ptr
|
mov edi, [esp + 8 + 4] // dst_ptr
|
||||||
mov esi, [esp + 8 + 8] // src_ptr
|
mov esi, [esp + 8 + 8] // src_ptr
|
||||||
mov edx, [esp + 8 + 12] // src_stride
|
mov edx, [esp + 8 + 12] // src_stride
|
||||||
mov ecx, [esp + 8 + 16] // dst_width
|
mov ecx, [esp + 8 + 16] // dst_width
|
||||||
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
mov eax, [esp + 8 + 20] // source_y_fraction (0..255)
|
||||||
sub edi, esi
|
sub edi, esi
|
||||||
// Dispatch to specialized filters if applicable.
|
// Dispatch to specialized filters if applicable.
|
||||||
cmp eax, 0
|
cmp eax, 0
|
||||||
je xloop100 // 0 /256. Blend 100 / 0.
|
je xloop100 // 0 /256. Blend 100 / 0.
|
||||||
cmp eax, 128
|
cmp eax, 128
|
||||||
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
|
je xloop50 // 128 / 256 is 0.50. Blend 50 / 50.
|
||||||
|
|
||||||
movd xmm0, eax // high fraction 0..255
|
movd xmm0, eax // high fraction 0..255
|
||||||
neg eax
|
neg eax
|
||||||
@ -5840,7 +5840,7 @@ __declspec(naked) void InterpolateRow_SSSE3(uint8_t* dst_ptr,
|
|||||||
movdqu xmm1, xmm0
|
movdqu xmm1, xmm0
|
||||||
punpcklbw xmm0, xmm2
|
punpcklbw xmm0, xmm2
|
||||||
punpckhbw xmm1, xmm2
|
punpckhbw xmm1, xmm2
|
||||||
psubb xmm0, xmm4 // bias image by -128
|
psubb xmm0, xmm4 // bias image by -128
|
||||||
psubb xmm1, xmm4
|
psubb xmm1, xmm4
|
||||||
movdqa xmm2, xmm5
|
movdqa xmm2, xmm5
|
||||||
movdqa xmm3, xmm5
|
movdqa xmm3, xmm5
|
||||||
@ -5889,8 +5889,8 @@ __declspec(naked) void ARGBShuffleRow_SSSE3(const uint8_t* src_argb,
|
|||||||
const uint8_t* shuffler,
|
const uint8_t* shuffler,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src_argb
|
mov eax, [esp + 4] // src_argb
|
||||||
mov edx, [esp + 8] // dst_argb
|
mov edx, [esp + 8] // dst_argb
|
||||||
mov ecx, [esp + 12] // shuffler
|
mov ecx, [esp + 12] // shuffler
|
||||||
movdqu xmm5, [ecx]
|
movdqu xmm5, [ecx]
|
||||||
mov ecx, [esp + 16] // width
|
mov ecx, [esp + 16] // width
|
||||||
@ -5916,8 +5916,8 @@ __declspec(naked) void ARGBShuffleRow_AVX2(const uint8_t* src_argb,
|
|||||||
const uint8_t* shuffler,
|
const uint8_t* shuffler,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] // src_argb
|
mov eax, [esp + 4] // src_argb
|
||||||
mov edx, [esp + 8] // dst_argb
|
mov edx, [esp + 8] // dst_argb
|
||||||
mov ecx, [esp + 12] // shuffler
|
mov ecx, [esp + 12] // shuffler
|
||||||
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
|
vbroadcastf128 ymm5, [ecx] // same shuffle in high as low.
|
||||||
mov ecx, [esp + 16] // width
|
mov ecx, [esp + 16] // width
|
||||||
@ -5954,18 +5954,18 @@ __declspec(naked) void I422ToYUY2Row_SSE2(const uint8_t* src_y,
|
|||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
mov eax, [esp + 8 + 4] // src_y
|
mov eax, [esp + 8 + 4] // src_y
|
||||||
mov esi, [esp + 8 + 8] // src_u
|
mov esi, [esp + 8 + 8] // src_u
|
||||||
mov edx, [esp + 8 + 12] // src_v
|
mov edx, [esp + 8 + 12] // src_v
|
||||||
mov edi, [esp + 8 + 16] // dst_frame
|
mov edi, [esp + 8 + 16] // dst_frame
|
||||||
mov ecx, [esp + 8 + 20] // width
|
mov ecx, [esp + 8 + 20] // width
|
||||||
sub edx, esi
|
sub edx, esi
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movq xmm2, qword ptr [esi] // U
|
movq xmm2, qword ptr [esi] // U
|
||||||
movq xmm3, qword ptr [esi + edx] // V
|
movq xmm3, qword ptr [esi + edx] // V
|
||||||
lea esi, [esi + 8]
|
lea esi, [esi + 8]
|
||||||
punpcklbw xmm2, xmm3 // UV
|
punpcklbw xmm2, xmm3 // UV
|
||||||
movdqu xmm0, [eax] // Y
|
movdqu xmm0, [eax] // Y
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
movdqa xmm1, xmm0
|
movdqa xmm1, xmm0
|
||||||
@ -5991,22 +5991,22 @@ __declspec(naked) void I422ToUYVYRow_SSE2(const uint8_t* src_y,
|
|||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
mov eax, [esp + 8 + 4] // src_y
|
mov eax, [esp + 8 + 4] // src_y
|
||||||
mov esi, [esp + 8 + 8] // src_u
|
mov esi, [esp + 8 + 8] // src_u
|
||||||
mov edx, [esp + 8 + 12] // src_v
|
mov edx, [esp + 8 + 12] // src_v
|
||||||
mov edi, [esp + 8 + 16] // dst_frame
|
mov edi, [esp + 8 + 16] // dst_frame
|
||||||
mov ecx, [esp + 8 + 20] // width
|
mov ecx, [esp + 8 + 20] // width
|
||||||
sub edx, esi
|
sub edx, esi
|
||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
movq xmm2, qword ptr [esi] // U
|
movq xmm2, qword ptr [esi] // U
|
||||||
movq xmm3, qword ptr [esi + edx] // V
|
movq xmm3, qword ptr [esi + edx] // V
|
||||||
lea esi, [esi + 8]
|
lea esi, [esi + 8]
|
||||||
punpcklbw xmm2, xmm3 // UV
|
punpcklbw xmm2, xmm3 // UV
|
||||||
movdqu xmm0, [eax] // Y
|
movdqu xmm0, [eax] // Y
|
||||||
movdqa xmm1, xmm2
|
movdqa xmm1, xmm2
|
||||||
lea eax, [eax + 16]
|
lea eax, [eax + 16]
|
||||||
punpcklbw xmm1, xmm0 // UYVY
|
punpcklbw xmm1, xmm0 // UYVY
|
||||||
punpckhbw xmm2, xmm0
|
punpckhbw xmm2, xmm0
|
||||||
movdqu [edi], xmm1
|
movdqu [edi], xmm1
|
||||||
movdqu [edi + 16], xmm2
|
movdqu [edi + 16], xmm2
|
||||||
@ -6033,10 +6033,10 @@ __declspec(naked) void ARGBPolynomialRow_SSE2(const uint8_t* src_argb,
|
|||||||
mov ecx, [esp + 4 + 16] /* width */
|
mov ecx, [esp + 4 + 16] /* width */
|
||||||
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
pxor xmm3, xmm3 // 0 constant for zero extending bytes to ints.
|
||||||
|
|
||||||
// 2 pixel loop.
|
// 2 pixel loop.
|
||||||
convertloop:
|
convertloop:
|
||||||
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
// pmovzxbd xmm0, dword ptr [eax] // BGRA pixel
|
||||||
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
|
// pmovzxbd xmm4, dword ptr [eax + 4] // BGRA pixel
|
||||||
movq xmm0, qword ptr [eax] // BGRABGRA
|
movq xmm0, qword ptr [eax] // BGRABGRA
|
||||||
lea eax, [eax + 8]
|
lea eax, [eax + 8]
|
||||||
punpcklbw xmm0, xmm3
|
punpcklbw xmm0, xmm3
|
||||||
@ -6085,8 +6085,8 @@ __declspec(naked) void ARGBPolynomialRow_AVX2(const uint8_t* src_argb,
|
|||||||
const float* poly,
|
const float* poly,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] /* src_argb */
|
mov eax, [esp + 4] /* src_argb */
|
||||||
mov edx, [esp + 8] /* dst_argb */
|
mov edx, [esp + 8] /* dst_argb */
|
||||||
mov ecx, [esp + 12] /* poly */
|
mov ecx, [esp + 12] /* poly */
|
||||||
vbroadcastf128 ymm4, [ecx] // C0
|
vbroadcastf128 ymm4, [ecx] // C0
|
||||||
vbroadcastf128 ymm5, [ecx + 16] // C1
|
vbroadcastf128 ymm5, [ecx + 16] // C1
|
||||||
@ -6125,8 +6125,8 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
|
|||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] /* src */
|
mov eax, [esp + 4] /* src */
|
||||||
mov edx, [esp + 8] /* dst */
|
mov edx, [esp + 8] /* dst */
|
||||||
movd xmm4, dword ptr [esp + 12] /* scale */
|
movd xmm4, dword ptr [esp + 12] /* scale */
|
||||||
mov ecx, [esp + 16] /* width */
|
mov ecx, [esp + 16] /* width */
|
||||||
mulss xmm4, kExpBias
|
mulss xmm4, kExpBias
|
||||||
@ -6134,7 +6134,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16_t* src,
|
|||||||
pxor xmm5, xmm5
|
pxor xmm5, xmm5
|
||||||
sub edx, eax
|
sub edx, eax
|
||||||
|
|
||||||
// 8 pixel loop.
|
// 8 pixel loop.
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm2, xmmword ptr [eax] // 8 shorts
|
movdqu xmm2, xmmword ptr [eax] // 8 shorts
|
||||||
add eax, 16
|
add eax, 16
|
||||||
@ -6172,7 +6172,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
|
|||||||
vpxor ymm5, ymm5, ymm5
|
vpxor ymm5, ymm5, ymm5
|
||||||
sub edx, eax
|
sub edx, eax
|
||||||
|
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
convertloop:
|
convertloop:
|
||||||
vmovdqu ymm2, [eax] // 16 shorts
|
vmovdqu ymm2, [eax] // 16 shorts
|
||||||
add eax, 32
|
add eax, 32
|
||||||
@ -6182,7 +6182,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16_t* src,
|
|||||||
vcvtdq2ps ymm2, ymm2
|
vcvtdq2ps ymm2, ymm2
|
||||||
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
|
vmulps ymm3, ymm3, ymm4 // scale to adjust exponent for 5 bit range.
|
||||||
vmulps ymm2, ymm2, ymm4
|
vmulps ymm2, ymm2, ymm4
|
||||||
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
|
vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate
|
||||||
vpsrld ymm2, ymm2, 13
|
vpsrld ymm2, ymm2, 13
|
||||||
vpackssdw ymm2, ymm2, ymm3
|
vpackssdw ymm2, ymm2, ymm3
|
||||||
vmovdqu [eax + edx - 32], ymm2
|
vmovdqu [eax + edx - 32], ymm2
|
||||||
@ -6200,22 +6200,22 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16_t* src,
|
|||||||
float scale,
|
float scale,
|
||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
mov eax, [esp + 4] /* src */
|
mov eax, [esp + 4] /* src */
|
||||||
mov edx, [esp + 8] /* dst */
|
mov edx, [esp + 8] /* dst */
|
||||||
vbroadcastss ymm4, [esp + 12] /* scale */
|
vbroadcastss ymm4, [esp + 12] /* scale */
|
||||||
mov ecx, [esp + 16] /* width */
|
mov ecx, [esp + 16] /* width */
|
||||||
sub edx, eax
|
sub edx, eax
|
||||||
|
|
||||||
// 16 pixel loop.
|
// 16 pixel loop.
|
||||||
convertloop:
|
convertloop:
|
||||||
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
|
vpmovzxwd ymm2, xmmword ptr [eax] // 8 shorts -> 8 ints
|
||||||
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
|
vpmovzxwd ymm3, xmmword ptr [eax + 16] // 8 more shorts
|
||||||
add eax, 32
|
add eax, 32
|
||||||
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
|
vcvtdq2ps ymm2, ymm2 // convert 8 ints to floats
|
||||||
vcvtdq2ps ymm3, ymm3
|
vcvtdq2ps ymm3, ymm3
|
||||||
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
|
vmulps ymm2, ymm2, ymm4 // scale to normalized range 0 to 1
|
||||||
vmulps ymm3, ymm3, ymm4
|
vmulps ymm3, ymm3, ymm4
|
||||||
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
|
vcvtps2ph xmm2, ymm2, 3 // float convert to 8 half floats truncate
|
||||||
vcvtps2ph xmm3, ymm3, 3
|
vcvtps2ph xmm3, ymm3, 3
|
||||||
vmovdqu [eax + edx + 32], xmm2
|
vmovdqu [eax + edx + 32], xmm2
|
||||||
vmovdqu [eax + edx + 32 + 16], xmm3
|
vmovdqu [eax + edx + 32 + 16], xmm3
|
||||||
@ -6234,8 +6234,8 @@ __declspec(naked) void ARGBColorTableRow_X86(uint8_t* dst_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] /* dst_argb */
|
mov eax, [esp + 4 + 4] /* dst_argb */
|
||||||
mov esi, [esp + 4 + 8] /* table_argb */
|
mov esi, [esp + 4 + 8] /* table_argb */
|
||||||
mov ecx, [esp + 4 + 12] /* width */
|
mov ecx, [esp + 4 + 12] /* width */
|
||||||
|
|
||||||
// 1 pixel loop.
|
// 1 pixel loop.
|
||||||
@ -6268,8 +6268,8 @@ __declspec(naked) void RGBColorTableRow_X86(uint8_t* dst_argb,
|
|||||||
int width) {
|
int width) {
|
||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
mov eax, [esp + 4 + 4] /* dst_argb */
|
mov eax, [esp + 4 + 4] /* dst_argb */
|
||||||
mov esi, [esp + 4 + 8] /* table_argb */
|
mov esi, [esp + 4 + 8] /* table_argb */
|
||||||
mov ecx, [esp + 4 + 12] /* width */
|
mov ecx, [esp + 4 + 12] /* width */
|
||||||
|
|
||||||
// 1 pixel loop.
|
// 1 pixel loop.
|
||||||
@ -6303,8 +6303,8 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
|
|||||||
__asm {
|
__asm {
|
||||||
push esi
|
push esi
|
||||||
push edi
|
push edi
|
||||||
mov eax, [esp + 8 + 4] /* src_argb */
|
mov eax, [esp + 8 + 4] /* src_argb */
|
||||||
mov edi, [esp + 8 + 8] /* dst_argb */
|
mov edi, [esp + 8 + 8] /* dst_argb */
|
||||||
mov ecx, [esp + 8 + 12] /* width */
|
mov ecx, [esp + 8 + 12] /* width */
|
||||||
movd xmm2, dword ptr [esp + 8 + 16] // luma table
|
movd xmm2, dword ptr [esp + 8 + 16] // luma table
|
||||||
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
|
movd xmm3, dword ptr [esp + 8 + 20] // lumacoeff
|
||||||
@ -6314,7 +6314,7 @@ __declspec(naked) void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
|
|||||||
psllw xmm4, 8
|
psllw xmm4, 8
|
||||||
pxor xmm5, xmm5
|
pxor xmm5, xmm5
|
||||||
|
|
||||||
// 4 pixel loop.
|
// 4 pixel loop.
|
||||||
convertloop:
|
convertloop:
|
||||||
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
|
movdqu xmm0, xmmword ptr [eax] // generate luma ptr
|
||||||
pmaddubsw xmm0, xmm3
|
pmaddubsw xmm0, xmm3
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user