Change ScaleUVRowUp2_Biinear_16_SSE2 to SSE41

Bug: libyuv:928

xed -i scale_gcc.o:
SYM ScaleUVRowUp2_Linear_16_SSE2:
XDIS 0: LOGICAL   SSE2       660FEFED                 pxor xmm5, xmm5
XDIS 4: SSE       SSE2       660F76E4                 pcmpeqd xmm4, xmm4
XDIS 8: SSE       SSE2       660F72D41F               psrld xmm4, 0x1f
XDIS d: SSE       SSE2       660F72F401               pslld xmm4, 0x1
XDIS 12: DATAXFER  SSE2       F30F7E07                 movq xmm0, qword ptr [rdi]
XDIS 16: DATAXFER  SSE2       F30F7E4F04               movq xmm1, qword ptr [rdi+0x4]
XDIS 1b: SSE       SSE2       660F61C5                 punpcklwd xmm0, xmm5
XDIS 1f: SSE       SSE2       660F61CD                 punpcklwd xmm1, xmm5
XDIS 23: DATAXFER  SSE2       660F6FD0                 movdqa xmm2, xmm0
XDIS 27: DATAXFER  SSE2       660F6FD9                 movdqa xmm3, xmm1
XDIS 2b: SSE       SSE2       660F70D24E               pshufd xmm2, xmm2, 0x4e
XDIS 30: SSE       SSE2       660F70DB4E               pshufd xmm3, xmm3, 0x4e
XDIS 35: SSE       SSE2       660FFED4                 paddd xmm2, xmm4
XDIS 39: SSE       SSE2       660FFEDC                 paddd xmm3, xmm4
XDIS 3d: SSE       SSE2       660FFED0                 paddd xmm2, xmm0
XDIS 41: SSE       SSE2       660FFED9                 paddd xmm3, xmm1
XDIS 45: SSE       SSE2       660FFEC0                 paddd xmm0, xmm0
XDIS 49: SSE       SSE2       660FFEC9                 paddd xmm1, xmm1
XDIS 4d: SSE       SSE2       660FFEC2                 paddd xmm0, xmm2
XDIS 51: SSE       SSE2       660FFECB                 paddd xmm1, xmm3
XDIS 55: SSE       SSE2       660F72D002               psrld xmm0, 0x2
XDIS 5a: SSE       SSE2       660F72D102               psrld xmm1, 0x2
XDIS 5f: SSE       SSE4       660F382BC1               packusdw xmm0, xmm1
XDIS 64: DATAXFER  SSE2       F30F7F06                 movdqu xmmword ptr [rsi], xmm0
XDIS 68: MISC      BASE       488D7F08                 lea rdi, ptr [rdi+0x8]
XDIS 6c: MISC      BASE       488D7610                 lea rsi, ptr [rsi+0x10]
XDIS 70: BINARY    BASE       83EA04                   sub edx, 0x4
XDIS 73: COND_BR   BASE       7F9D                     jnle 0x12 <ScaleUVRowUp2_Linear_16_SSE2+0x12>
XDIS 75: RET       BASE       C3                       ret

SYM ScaleUVRowUp2_Bilinear_16_SSE2:
XDIS 0: LOGICAL   SSE2       660FEFFF                 pxor xmm7, xmm7
XDIS 4: SSE       SSE2       660F76F6                 pcmpeqd xmm6, xmm6
XDIS 8: SSE       SSE2       660F72D61F               psrld xmm6, 0x1f
XDIS d: SSE       SSE2       660F72F603               pslld xmm6, 0x3
XDIS 12: DATAXFER  SSE2       F30F7E07                 movq xmm0, qword ptr [rdi]
XDIS 16: DATAXFER  SSE2       F30F7E4F04               movq xmm1, qword ptr [rdi+0x4]
XDIS 1b: SSE       SSE2       660F61C7                 punpcklwd xmm0, xmm7
XDIS 1f: SSE       SSE2       660F61CF                 punpcklwd xmm1, xmm7
XDIS 23: DATAXFER  SSE2       660F6FD0                 movdqa xmm2, xmm0
XDIS 27: DATAXFER  SSE2       660F6FD9                 movdqa xmm3, xmm1
XDIS 2b: SSE       SSE2       660F70D24E               pshufd xmm2, xmm2, 0x4e
XDIS 30: SSE       SSE2       660F70DB4E               pshufd xmm3, xmm3, 0x4e
XDIS 35: SSE       SSE2       660FFED0                 paddd xmm2, xmm0
XDIS 39: SSE       SSE2       660FFED9                 paddd xmm3, xmm1
XDIS 3d: SSE       SSE2       660FFEC0                 paddd xmm0, xmm0
XDIS 41: SSE       SSE2       660FFEC9                 paddd xmm1, xmm1
XDIS 45: SSE       SSE2       660FFEC2                 paddd xmm0, xmm2
XDIS 49: SSE       SSE2       660FFECB                 paddd xmm1, xmm3
XDIS 4d: DATAXFER  SSE2       F30F7E1477               movq xmm2, qword ptr [rdi+rsi*2]
XDIS 52: DATAXFER  SSE2       F30F7E5C7704             movq xmm3, qword ptr [rdi+rsi*2+0x4]
XDIS 58: SSE       SSE2       660F61D7                 punpcklwd xmm2, xmm7
XDIS 5c: SSE       SSE2       660F61DF                 punpcklwd xmm3, xmm7
XDIS 60: DATAXFER  SSE2       660F6FE2                 movdqa xmm4, xmm2
XDIS 64: DATAXFER  SSE2       660F6FEB                 movdqa xmm5, xmm3
XDIS 68: SSE       SSE2       660F70E44E               pshufd xmm4, xmm4, 0x4e
XDIS 6d: SSE       SSE2       660F70ED4E               pshufd xmm5, xmm5, 0x4e
XDIS 72: SSE       SSE2       660FFEE2                 paddd xmm4, xmm2
XDIS 76: SSE       SSE2       660FFEEB                 paddd xmm5, xmm3
XDIS 7a: SSE       SSE2       660FFED2                 paddd xmm2, xmm2
XDIS 7e: SSE       SSE2       660FFEDB                 paddd xmm3, xmm3
XDIS 82: SSE       SSE2       660FFED4                 paddd xmm2, xmm4
XDIS 86: SSE       SSE2       660FFEDD                 paddd xmm3, xmm5
XDIS 8a: DATAXFER  SSE2       660F6FE0                 movdqa xmm4, xmm0
XDIS 8e: DATAXFER  SSE2       660F6FEA                 movdqa xmm5, xmm2
XDIS 92: SSE       SSE2       660FFEE0                 paddd xmm4, xmm0
XDIS 96: SSE       SSE2       660FFEEE                 paddd xmm5, xmm6
XDIS 9a: SSE       SSE2       660FFEE0                 paddd xmm4, xmm0
XDIS 9e: SSE       SSE2       660FFEE5                 paddd xmm4, xmm5
XDIS a2: SSE       SSE2       660F72D404               psrld xmm4, 0x4
XDIS a7: DATAXFER  SSE2       660F6FEA                 movdqa xmm5, xmm2
XDIS ab: SSE       SSE2       660FFEEA                 paddd xmm5, xmm2
XDIS af: SSE       SSE2       660FFEC6                 paddd xmm0, xmm6
XDIS b3: SSE       SSE2       660FFEEA                 paddd xmm5, xmm2
XDIS b7: SSE       SSE2       660FFEE8                 paddd xmm5, xmm0
XDIS bb: SSE       SSE2       660F72D504               psrld xmm5, 0x4
XDIS c0: DATAXFER  SSE2       660F6FC1                 movdqa xmm0, xmm1
XDIS c4: DATAXFER  SSE2       660F6FD3                 movdqa xmm2, xmm3
XDIS c8: SSE       SSE2       660FFEC1                 paddd xmm0, xmm1
XDIS cc: SSE       SSE2       660FFED6                 paddd xmm2, xmm6
XDIS d0: SSE       SSE2       660FFEC1                 paddd xmm0, xmm1
XDIS d4: SSE       SSE2       660FFEC2                 paddd xmm0, xmm2
XDIS d8: SSE       SSE2       660F72D004               psrld xmm0, 0x4
XDIS dd: DATAXFER  SSE2       660F6FD3                 movdqa xmm2, xmm3
XDIS e1: SSE       SSE2       660FFED3                 paddd xmm2, xmm3
XDIS e5: SSE       SSE2       660FFECE                 paddd xmm1, xmm6
XDIS e9: SSE       SSE2       660FFED3                 paddd xmm2, xmm3
XDIS ed: SSE       SSE2       660FFED1                 paddd xmm2, xmm1
XDIS f1: SSE       SSE2       660F72D204               psrld xmm2, 0x4
XDIS f6: SSE       SSE4       660F382BE0               packusdw xmm4, xmm0
XDIS fb: DATAXFER  SSE2       F30F7F22                 movdqu xmmword ptr [rdx], xmm4
XDIS ff: SSE       SSE4       660F382BEA               packusdw xmm5, xmm2
XDIS 104: DATAXFER  SSE2       F30F7F2C4A               movdqu xmmword ptr [rdx+rcx*2], xmm5
XDIS 109: MISC      BASE       488D7F08                 lea rdi, ptr [rdi+0x8]
XDIS 10d: MISC      BASE       488D5210                 lea rdx, ptr [rdx+0x10]
XDIS 111: BINARY    BASE       4183E804                 sub r8d, 0x4
XDIS 115: COND_BR   BASE       0F8FF7FEFFFF             jnle 0x12 <ScaleUVRowUp2_Bilinear_16_SSE2+0x12>
XDIS 11b: RET       BASE       C3                       ret

Change-Id: Ia20860e9c3c45368822cfd8877167ff0bf973dcc
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3587602
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-04-15 11:21:25 -07:00 committed by libyuv LUCI CQ
parent 18f9110516
commit eec8dd37e8
10 changed files with 72 additions and 64 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1819
Version: 1820
License: BSD
License File: LICENSE

View File

@ -86,8 +86,8 @@ extern "C" {
#define HAS_SCALEROWUP2BILINEAR_16_SSE2
#define HAS_SCALEUVROWUP2LINEAR_SSSE3
#define HAS_SCALEUVROWUP2BILINEAR_SSSE3
#define HAS_SCALEUVROWUP2LINEAR_16_SSE2
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE2
#define HAS_SCALEUVROWUP2LINEAR_16_SSE41
#define HAS_SCALEUVROWUP2BILINEAR_16_SSE41
#endif
// The following are available for gcc/clang x86 platforms, but
@ -1235,22 +1235,22 @@ void ScaleUVRowUp2_Bilinear_Any_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_Any_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);
void ScaleUVRowUp2_Bilinear_16_Any_SSE41(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width);
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width);

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1819
#define LIBYUV_VERSION 1820
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -83,7 +83,8 @@ int I420Copy(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@ -125,7 +126,8 @@ int I010Copy(const uint16_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@ -169,7 +171,8 @@ static int Planar16bitTo8bit(const uint16_t* src_y,
int uv_width = SUBSAMPLE(width, subsample_x, subsample_x);
int uv_height = SUBSAMPLE(height, subsample_y, subsample_y);
int scale = 1 << (24 - depth);
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@ -539,7 +542,8 @@ int I422ToI210(const uint8_t* src_y,
int width,
int height) {
int halfwidth = (width + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.

View File

@ -6647,9 +6647,9 @@ static int P010ToARGBMatrixBilinear(const uint16_t* src_y,
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
}
#endif
@ -6737,9 +6737,9 @@ static int P210ToARGBMatrixLinear(const uint16_t* src_y,
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
}
#endif
@ -6813,9 +6813,9 @@ static int P010ToAR30MatrixBilinear(const uint16_t* src_y,
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
}
#endif
@ -6903,9 +6903,9 @@ static int P210ToAR30MatrixLinear(const uint16_t* src_y,
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
}
#endif

View File

@ -85,7 +85,8 @@ int I420ToI010(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@ -129,7 +130,8 @@ int I420ToI012(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
int halfheight = (height + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.

View File

@ -240,7 +240,8 @@ int I422Copy(const uint8_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
@ -279,7 +280,8 @@ int I444Copy(const uint8_t* src_y,
int dst_stride_v,
int width,
int height) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}
// Negative height means invert the image.
@ -319,7 +321,8 @@ int I210Copy(const uint16_t* src_y,
int height) {
int halfwidth = (width + 1) >> 1;
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 || height == 0) {
if ((!src_y && dst_y) || !src_u || !src_v || !dst_u || !dst_v || width <= 0 ||
height == 0) {
return -1;
}

View File

@ -924,9 +924,9 @@ SBUH2LANY(ScaleUVRowUp2_Linear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE2,
ScaleUVRowUp2_Linear_16_SSE2,
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
SBUH2LANY(ScaleUVRowUp2_Linear_16_Any_SSE41,
ScaleUVRowUp2_Linear_16_SSE41,
ScaleUVRowUp2_Linear_16_C,
3,
uint16_t)
@ -1022,9 +1022,9 @@ SBU2BLANY(ScaleUVRowUp2_Bilinear_Any_AVX2,
uint8_t)
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE2,
ScaleUVRowUp2_Bilinear_16_SSE2,
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
SBU2BLANY(ScaleUVRowUp2_Bilinear_16_Any_SSE41,
ScaleUVRowUp2_Bilinear_16_SSE41,
ScaleUVRowUp2_Bilinear_16_C,
7,
uint16_t)

View File

@ -1285,7 +1285,6 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"packuswb %%xmm2,%%xmm0 \n"
"movdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
@ -2666,10 +2665,10 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
}
#endif
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
@ -2716,12 +2715,12 @@ void ScaleUVRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
}
#endif
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
void ScaleUVRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"

View File

@ -747,9 +747,9 @@ void ScaleUVLinearUp2_16(int src_width,
// This function can only scale up by 2 times horizontally.
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2LINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
ScaleRowUp = ScaleUVRowUp2_Linear_16_Any_SSE41;
}
#endif
@ -800,9 +800,9 @@ void ScaleUVBilinearUp2_16(int src_width,
assert(src_width == ((dst_width + 1) / 2));
assert(src_height == ((dst_height + 1) / 2));
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE2;
#ifdef HAS_SCALEUVROWUP2BILINEAR_16_SSE41
if (TestCpuFlag(kCpuHasSSE41)) {
Scale2RowUp = ScaleUVRowUp2_Bilinear_16_Any_SSE41;
}
#endif