Pass rgbconstants via struct pointer ("r" constraint) instead of individual elements with "m" constraints

Now 66 instructions
SYM ARGBToUVRow_SSSE3:
62ccd0: BASE       push ebp
62ccd1: BASE       mov ebp, esp
62ccd3: BASE       push ebx
62ccd4: BASE       push edi
62ccd5: BASE       push esi
62ccd6: BASE       and esp, 0xfffffffc
62ccd9: BASE       sub esp, 0xc
62ccdc: BASE       call 0x62cce1 <ARGBToUVRow_SSSE3+0x11>
62cce1: BASE       pop eax
62cce2: BASE       add eax, 0xe1c27
62cce8: BASE       mov ecx, dword ptr [ebp+0xc]
62cceb: BASE       mov edx, dword ptr [ebp+0x8]
62ccee: BASE       mov esi, dword ptr [ebp+0x10]
62ccf1: BASE       mov edi, dword ptr [ebp+0x18]
62ccf4: BASE       mov dword ptr [esp+0x8], edi
62ccf8: BASE       mov edi, dword ptr [ebp+0x14]
62ccfb: BASE       lea ebx, ptr [eax-0x5ecf88]
62cd01: SSE2       movdqa xmm4, xmmword ptr [ebx]
62cd05: SSE2       movdqa xmm5, xmmword ptr [ebx+0x10]
62cd0a: SSE2       pcmpeqb xmm6, xmm6
62cd0e: SSSE3      pabsb xmm6, xmm6
62cd13: SSE2       movdqa xmm7, xmmword ptr [eax-0x5ecfa8]
62cd1b: BASE       sub edi, esi

62cd1d: SSE2       movdqu xmm0, xmmword ptr [edx]
62cd21: SSE2       movdqu xmm1, xmmword ptr [edx+0x10]
62cd26: SSE2       movdqu xmm2, xmmword ptr [edx+ecx*1]
62cd2b: SSE2       movdqu xmm3, xmmword ptr [edx+ecx*1+0x10]
62cd31: SSSE3      pshufb xmm0, xmm7
62cd36: SSSE3      pshufb xmm1, xmm7
62cd3b: SSSE3      pshufb xmm2, xmm7
62cd40: SSSE3      pshufb xmm3, xmm7
62cd45: SSSE3      pmaddubsw xmm0, xmm6
62cd4a: SSSE3      pmaddubsw xmm1, xmm6
62cd4f: SSSE3      pmaddubsw xmm2, xmm6
62cd54: SSSE3      pmaddubsw xmm3, xmm6
62cd59: SSE2       paddw xmm0, xmm2
62cd5d: SSE2       paddw xmm1, xmm3
62cd61: SSE2       pxor xmm2, xmm2
62cd65: SSE2       psrlw xmm0, 0x1
62cd6a: SSE2       psrlw xmm1, 0x1
62cd6f: SSE2       pavgw xmm0, xmm2
62cd73: SSE2       pavgw xmm1, xmm2
62cd77: SSE2       packuswb xmm0, xmm1
62cd7b: SSE2       movdqa xmm2, xmm6
62cd7f: SSE2       psllw xmm2, 0xf
62cd84: SSE2       movdqa xmm1, xmm0
62cd88: SSSE3      pmaddubsw xmm1, xmm5
62cd8d: SSSE3      pmaddubsw xmm0, xmm4
62cd92: SSSE3      phaddw xmm0, xmm1
62cd97: SSE2       psubw xmm2, xmm0
62cd9b: SSE2       psrlw xmm2, 0x8
62cda0: SSE2       packuswb xmm2, xmm2
62cda4: SSE2       movd dword ptr [esi], xmm2
62cda8: SSE2       pshufd xmm2, xmm2, 0x55
62cdad: SSE2       movd dword ptr [esi+edi*1], xmm2
62cdb2: BASE       lea edx, ptr [edx+0x20]
62cdb5: BASE       lea esi, ptr [esi+0x4]
62cdb8: BASE       sub dword ptr [esp+0x8], 0x8
62cdbd: BASE       jnle 0x62cd1d <ARGBToUVRow_SSSE3+0x4d>

62cdc3: BASE       lea esp, ptr [ebp-0xc]
62cdc6: BASE       pop esi
62cdc7: BASE       pop edi
62cdc8: BASE       pop ebx
62cdc9: BASE       pop ebp
62cdca: BASE       ret

Was 68 instructions
ARGBToUVRow_SSSE3:
62ccd0: BASE       push ebp
62ccd1: BASE       mov ebp, esp
62ccd3: BASE       push edi
62ccd4: BASE       push esi
62ccd5: BASE       and esp, 0xfffffff0
62ccd8: BASE       sub esp, 0x30
62ccdb: BASE       call 0x62cce0 <ARGBToUVRow_SSSE3+0x10>
62cce0: BASE       pop eax
62cce1: BASE       add eax, 0xe1c28
62cce7: BASE       mov ecx, dword ptr [ebp+0xc]
62ccea: BASE       mov edx, dword ptr [ebp+0x8]
62cced: BASE       mov esi, dword ptr [ebp+0x10]
62ccf0: BASE       mov edi, dword ptr [ebp+0x18]
62ccf3: BASE       mov dword ptr [esp+0xc], edi
62ccf7: BASE       mov edi, dword ptr [ebp+0x14]
62ccfa: SSE        movaps xmm0, xmmword ptr [eax-0x5ecf88]
62cd01: SSE        movaps xmmword ptr [esp+0x20], xmm0
62cd06: SSE        movaps xmm0, xmmword ptr [eax-0x5ecf78]
62cd0d: SSE        movaps xmmword ptr [esp+0x10], xmm0
62cd12: SSE2       movdqa xmm4, xmmword ptr [esp+0x20]
62cd18: SSE2       movdqa xmm5, xmmword ptr [esp+0x10]
62cd1e: SSE2       pcmpeqb xmm6, xmm6
62cd22: SSSE3      pabsb xmm6, xmm6
62cd27: SSE2       movdqa xmm7, xmmword ptr [eax-0x5ecfa8]
62cd2f: BASE       sub edi, esi

62cd31: SSE2       movdqu xmm0, xmmword ptr [edx]
62cd35: SSE2       movdqu xmm1, xmmword ptr [edx+0x10]
62cd3a: SSE2       movdqu xmm2, xmmword ptr [edx+ecx*1]
62cd3f: SSE2       movdqu xmm3, xmmword ptr [edx+ecx*1+0x10]
62cd45: SSSE3      pshufb xmm0, xmm7
62cd4a: SSSE3      pshufb xmm1, xmm7
62cd4f: SSSE3      pshufb xmm2, xmm7
62cd54: SSSE3      pshufb xmm3, xmm7
62cd59: SSSE3      pmaddubsw xmm0, xmm6
62cd5e: SSSE3      pmaddubsw xmm1, xmm6
62cd63: SSSE3      pmaddubsw xmm2, xmm6
62cd68: SSSE3      pmaddubsw xmm3, xmm6
62cd6d: SSE2       paddw xmm0, xmm2
62cd71: SSE2       paddw xmm1, xmm3
62cd75: SSE2       pxor xmm2, xmm2
62cd79: SSE2       psrlw xmm0, 0x1
62cd7e: SSE2       psrlw xmm1, 0x1
62cd83: SSE2       pavgw xmm0, xmm2
62cd87: SSE2       pavgw xmm1, xmm2
62cd8b: SSE2       packuswb xmm0, xmm1
62cd8f: SSE2       movdqa xmm2, xmm6
62cd93: SSE2       psllw xmm2, 0xf
62cd98: SSE2       movdqa xmm1, xmm0
62cd9c: SSSE3      pmaddubsw xmm1, xmm5
62cda1: SSSE3      pmaddubsw xmm0, xmm4
62cda6: SSSE3      phaddw xmm0, xmm1
62cdab: SSE2       psubw xmm2, xmm0
62cdaf: SSE2       psrlw xmm2, 0x8
62cdb4: SSE2       packuswb xmm2, xmm2
62cdb8: SSE2       movd dword ptr [esi], xmm2
62cdbc: SSE2       pshufd xmm2, xmm2, 0x55
62cdc1: SSE2       movd dword ptr [esi+edi*1], xmm2
62cdc6: BASE       lea edx, ptr [edx+0x20]
62cdc9: BASE       lea esi, ptr [esi+0x4]
62cdcc: BASE       sub dword ptr [esp+0xc], 0x8
62cdd1: BASE       jnle 0x62cd31 <ARGBToUVRow_SSSE3+0x61>

62cdd7: BASE       lea esp, ptr [ebp-0x8]
62cdda: BASE       pop esi
62cddb: BASE       pop edi
62cddc: BASE       pop ebp
62cddd: BASE       ret
62cdde: BASE       int3
BUG=444157316

Change-Id: Iad044f851359f5b052091c7bdab9b96946fc3682
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6987370
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
Frank Barchard 2025-09-26 12:19:35 -07:00
parent 5b22f31cb5
commit 94417b9d21
3 changed files with 75 additions and 116 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1919 Version: 1920
License: BSD-3-Clause License: BSD-3-Clause
License File: LICENSE License File: LICENSE
Shipped: yes Shipped: yes

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1919 #define LIBYUV_VERSION 1920
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -9,9 +9,6 @@
*/ */
#include "libyuv/row.h" #include "libyuv/row.h"
#if defined(__i386__) && defined(__pic__)
#include <string.h>
#endif
#ifdef __cplusplus #ifdef __cplusplus
namespace libyuv { namespace libyuv {
extern "C" { extern "C" {
@ -57,9 +54,6 @@ static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u}; 0x8080u, 0x8080u, 0x8080u, 0x8080u};
static const uvec16 kAddUV128 = {0x8000u, 0x8000u, 0x8000u, 0x8000u,
0x8000u, 0x8000u, 0x8000u, 0x8000u};
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
0x8080u, 0x8080u, 0x8080u, 0x8080u}; 0x8080u, 0x8080u, 0x8080u, 0x8080u};
@ -286,7 +280,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) { void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
asm volatile( asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff "pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
"psrld $0x18,%%xmm5 \n" "psrld $24,%%xmm5 \n"
"movdqa %3,%%xmm4 \n" "movdqa %3,%%xmm4 \n"
LABELALIGN LABELALIGN
@ -364,8 +358,8 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
"pcmpeqb %%xmm3,%%xmm3 \n" "pcmpeqb %%xmm3,%%xmm3 \n"
"psllw $0xb,%%xmm3 \n" "psllw $0xb,%%xmm3 \n"
"pcmpeqb %%xmm4,%%xmm4 \n" "pcmpeqb %%xmm4,%%xmm4 \n"
"psllw $0xa,%%xmm4 \n" "psllw $10,%%xmm4 \n"
"psrlw $0x5,%%xmm4 \n" "psrlw $5,%%xmm4 \n"
"pcmpeqb %%xmm7,%%xmm7 \n" "pcmpeqb %%xmm7,%%xmm7 \n"
"psllw $0x8,%%xmm7 \n" "psllw $0x8,%%xmm7 \n"
"sub %0,%1 \n" "sub %0,%1 \n"
@ -1592,15 +1586,20 @@ struct RgbUVConstants {
vec8 kRGBToV; vec8 kRGBToV;
}; };
// Offsets into RgbUVConstants structure
#define KRGBTOU 0
#define KRGBTOV 16
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb, void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_u, uint8_t* dst_u,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct RgbUVConstants* rgbuvconstants) {
asm volatile( asm volatile(
"movdqa %4,%%xmm3 \n" "pcmpeqb %%xmm5,%%xmm5 \n" // 0x8000
"movdqa %5,%%xmm4 \n" "psllw $15,%%xmm5 \n"
"movdqa %6,%%xmm5 \n" "movdqa 0x0(%4),%%xmm3 \n" // kRGBToU
"movdqa 0x10(%4),%%xmm4 \n" // kRGBToV
"sub %1,%2 \n" "sub %1,%2 \n"
LABELALIGN LABELALIGN
@ -1655,9 +1654,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
#else #else
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "m"(rgbuvconstants->kRGBToU), // %4 : "r"(rgbuvconstants) // %4
"m"(rgbuvconstants->kRGBToV), // %5
"m"(kAddUV128) // %6
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"); : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
} }
#endif // HAS_ARGBTOUV444ROW_SSSE3 #endif // HAS_ARGBTOUV444ROW_SSSE3
@ -1670,10 +1667,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct RgbUVConstants* rgbuvconstants) {
asm volatile( asm volatile(
"vbroadcastf128 %4,%%ymm3 \n" "vbroadcastf128 0x0(%4),%%ymm3 \n" // kRGBToU
"vbroadcastf128 %5,%%ymm4 \n" "vbroadcastf128 0x10(%4),%%ymm4 \n" // kRGBToV
"vbroadcastf128 %6,%%ymm5 \n" "vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0x8000
"vmovdqa %7,%%ymm7 \n" "vpsllw $15,%%ymm5,%%ymm5 \n"
"vmovdqa %5,%%ymm7 \n"
"sub %1,%2 \n" "sub %1,%2 \n"
LABELALIGN LABELALIGN
@ -1726,10 +1724,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
#else #else
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "m"(rgbuvconstants->kRGBToU), // %4 : "r"(rgbuvconstants), // %4
"m"(rgbuvconstants->kRGBToV), // %5 "m"(kPermdARGBToY_AVX) // %5
"m"(kAddUV128), // %6
"m"(kPermdARGBToY_AVX) // %7
: "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6", : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
"ymm7"); "ymm7");
} }
@ -1751,24 +1747,12 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct RgbUVConstants* rgbuvconstants) {
#if defined(__i386__) && defined(__pic__)
// i386 + PIC builds: Inline asm may run out of general-purpose registers.
// In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also
// unavailable), so addressing struct fields via memory operands can require
// extra temporaries that the compiler cannot allocate given the asm constraints.
// To avoid this, copy the RGB-to-UV constants to stack locals first and let the
// asm use simple stack-relative addressing.
__attribute__((aligned(16))) vec8 local_kRGBToU = {};
__attribute__((aligned(16))) vec8 local_kRGBToV = {};
memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU));
memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV));
#endif
asm volatile( asm volatile(
"movdqa %5,%%xmm4 \n" // RGBToU "movdqa 0x0(%5),%%xmm4 \n" // RGBToU
"movdqa %6,%%xmm5 \n" // RGBToV "movdqa 0x10(%5),%%xmm5 \n" // RGBToV
"pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101 "pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101
"pabsb %%xmm6,%%xmm6 \n" "pabsb %%xmm6,%%xmm6 \n"
"movdqa %7,%%xmm7 \n" // kShuffleAARRGGBB "movdqa %6,%%xmm7 \n" // kShuffleAARRGGBB
"sub %1,%2 \n" "sub %1,%2 \n"
"1: \n" "1: \n"
@ -1819,16 +1803,10 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "r"((intptr_t)(src_stride_argb)), // %4 : "r"((intptr_t)(src_stride_argb)), // %4
#if defined(__i386__) && defined(__pic__) "r"(rgbuvconstants), // %5
"m"(local_kRGBToU), // %5 "m"(kShuffleAARRGGBB) // %6
"m"(local_kRGBToV), // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
#else // defined(__i386__) && defined(__pic__) "xmm7");
"m"(rgbuvconstants->kRGBToU), // %5
"m"(rgbuvconstants->kRGBToV), // %6
#endif // defined(__i386__) && defined(__pic__)
"m"(kShuffleAARRGGBB) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
@ -1843,25 +1821,12 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
uint8_t* dst_v, uint8_t* dst_v,
int width, int width,
const struct RgbUVConstants* rgbuvconstants) { const struct RgbUVConstants* rgbuvconstants) {
#if defined(__i386__) && defined(__pic__)
// i386 + PIC builds: Inline asm may run out of general-purpose registers.
// In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also
// unavailable), so addressing struct fields via memory operands can require
// extra temporaries that the compiler cannot allocate given the asm constraints.
// To avoid this, copy the RGB-to-UV constants to stack locals first and let the
// asm use simple stack-relative addressing.
__attribute__((aligned(32))) vec8 local_kRGBToU = {};
__attribute__((aligned(32))) vec8 local_kRGBToV = {};
memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU));
memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV));
#endif
asm volatile( asm volatile(
"vbroadcastf128 %5,%%ymm4 \n" // RGBToU "vbroadcastf128 0(%5),%%ymm4 \n" // RGBToU
"vbroadcastf128 %6,%%ymm5 \n" // RGBToV "vbroadcastf128 0x10(%5),%%ymm5 \n" // RGBToV
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101 "vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101
"vpabsb %%ymm6,%%ymm6 \n" "vpabsb %%ymm6,%%ymm6 \n"
"vmovdqa %7,%%ymm7 \n" // kShuffleAARRGGBB "vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB
"sub %1,%2 \n" "sub %1,%2 \n"
"1: \n" "1: \n"
@ -1913,16 +1878,10 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "r"((intptr_t)(src_stride_argb)), // %4 : "r"((intptr_t)(src_stride_argb)), // %4
#if defined(__i386__) && defined(__pic__) "r"(rgbuvconstants), // %5
"m"(local_kRGBToU), // %5 "m"(kShuffleAARRGGBB) // %6
"m"(local_kRGBToV), // %6 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
#else "xmm7");
"m"(rgbuvconstants->kRGBToU), // %5
"m"(rgbuvconstants->kRGBToV), // %6
#endif // defined(__i386__) && defined(__pic__)
"m"(kShuffleAARRGGBB) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"xmm6", "xmm7");
} }
#endif // HAS_ARGBTOUVROW_AVX2 #endif // HAS_ARGBTOUVROW_AVX2