mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 08:46:47 +08:00
Pass rgbconstants via struct pointer instead of elements with "m" constraints
Now 66 instructions

SYM ARGBToUVRow_SSSE3:
62ccd0: BASE push ebp
62ccd1: BASE mov ebp, esp
62ccd3: BASE push ebx
62ccd4: BASE push edi
62ccd5: BASE push esi
62ccd6: BASE and esp, 0xfffffffc
62ccd9: BASE sub esp, 0xc
62ccdc: BASE call 0x62cce1 <ARGBToUVRow_SSSE3+0x11>
62cce1: BASE pop eax
62cce2: BASE add eax, 0xe1c27
62cce8: BASE mov ecx, dword ptr [ebp+0xc]
62cceb: BASE mov edx, dword ptr [ebp+0x8]
62ccee: BASE mov esi, dword ptr [ebp+0x10]
62ccf1: BASE mov edi, dword ptr [ebp+0x18]
62ccf4: BASE mov dword ptr [esp+0x8], edi
62ccf8: BASE mov edi, dword ptr [ebp+0x14]
62ccfb: BASE lea ebx, ptr [eax-0x5ecf88]
62cd01: SSE2 movdqa xmm4, xmmword ptr [ebx]
62cd05: SSE2 movdqa xmm5, xmmword ptr [ebx+0x10]
62cd0a: SSE2 pcmpeqb xmm6, xmm6
62cd0e: SSSE3 pabsb xmm6, xmm6
62cd13: SSE2 movdqa xmm7, xmmword ptr [eax-0x5ecfa8]
62cd1b: BASE sub edi, esi
62cd1d: SSE2 movdqu xmm0, xmmword ptr [edx]
62cd21: SSE2 movdqu xmm1, xmmword ptr [edx+0x10]
62cd26: SSE2 movdqu xmm2, xmmword ptr [edx+ecx*1]
62cd2b: SSE2 movdqu xmm3, xmmword ptr [edx+ecx*1+0x10]
62cd31: SSSE3 pshufb xmm0, xmm7
62cd36: SSSE3 pshufb xmm1, xmm7
62cd3b: SSSE3 pshufb xmm2, xmm7
62cd40: SSSE3 pshufb xmm3, xmm7
62cd45: SSSE3 pmaddubsw xmm0, xmm6
62cd4a: SSSE3 pmaddubsw xmm1, xmm6
62cd4f: SSSE3 pmaddubsw xmm2, xmm6
62cd54: SSSE3 pmaddubsw xmm3, xmm6
62cd59: SSE2 paddw xmm0, xmm2
62cd5d: SSE2 paddw xmm1, xmm3
62cd61: SSE2 pxor xmm2, xmm2
62cd65: SSE2 psrlw xmm0, 0x1
62cd6a: SSE2 psrlw xmm1, 0x1
62cd6f: SSE2 pavgw xmm0, xmm2
62cd73: SSE2 pavgw xmm1, xmm2
62cd77: SSE2 packuswb xmm0, xmm1
62cd7b: SSE2 movdqa xmm2, xmm6
62cd7f: SSE2 psllw xmm2, 0xf
62cd84: SSE2 movdqa xmm1, xmm0
62cd88: SSSE3 pmaddubsw xmm1, xmm5
62cd8d: SSSE3 pmaddubsw xmm0, xmm4
62cd92: SSSE3 phaddw xmm0, xmm1
62cd97: SSE2 psubw xmm2, xmm0
62cd9b: SSE2 psrlw xmm2, 0x8
62cda0: SSE2 packuswb xmm2, xmm2
62cda4: SSE2 movd dword ptr [esi], xmm2
62cda8: SSE2 pshufd xmm2, xmm2, 0x55
62cdad: SSE2 movd dword ptr [esi+edi*1], xmm2
62cdb2: BASE lea edx, ptr [edx+0x20]
62cdb5: BASE lea esi, ptr [esi+0x4]
62cdb8: BASE sub dword ptr [esp+0x8], 0x8
62cdbd: BASE jnle 0x62cd1d <ARGBToUVRow_SSSE3+0x4d>
62cdc3: BASE lea esp, ptr [ebp-0xc]
62cdc6: BASE pop esi
62cdc7: BASE pop edi
62cdc8: BASE pop ebx
62cdc9: BASE pop ebp
62cdca: BASE ret

Was 68 instructions

ARGBToUVRow_SSSE3:
62ccd0: BASE push ebp
62ccd1: BASE mov ebp, esp
62ccd3: BASE push edi
62ccd4: BASE push esi
62ccd5: BASE and esp, 0xfffffff0
62ccd8: BASE sub esp, 0x30
62ccdb: BASE call 0x62cce0 <ARGBToUVRow_SSSE3+0x10>
62cce0: BASE pop eax
62cce1: BASE add eax, 0xe1c28
62cce7: BASE mov ecx, dword ptr [ebp+0xc]
62ccea: BASE mov edx, dword ptr [ebp+0x8]
62cced: BASE mov esi, dword ptr [ebp+0x10]
62ccf0: BASE mov edi, dword ptr [ebp+0x18]
62ccf3: BASE mov dword ptr [esp+0xc], edi
62ccf7: BASE mov edi, dword ptr [ebp+0x14]
62ccfa: SSE movaps xmm0, xmmword ptr [eax-0x5ecf88]
62cd01: SSE movaps xmmword ptr [esp+0x20], xmm0
62cd06: SSE movaps xmm0, xmmword ptr [eax-0x5ecf78]
62cd0d: SSE movaps xmmword ptr [esp+0x10], xmm0
62cd12: SSE2 movdqa xmm4, xmmword ptr [esp+0x20]
62cd18: SSE2 movdqa xmm5, xmmword ptr [esp+0x10]
62cd1e: SSE2 pcmpeqb xmm6, xmm6
62cd22: SSSE3 pabsb xmm6, xmm6
62cd27: SSE2 movdqa xmm7, xmmword ptr [eax-0x5ecfa8]
62cd2f: BASE sub edi, esi
62cd31: SSE2 movdqu xmm0, xmmword ptr [edx]
62cd35: SSE2 movdqu xmm1, xmmword ptr [edx+0x10]
62cd3a: SSE2 movdqu xmm2, xmmword ptr [edx+ecx*1]
62cd3f: SSE2 movdqu xmm3, xmmword ptr [edx+ecx*1+0x10]
62cd45: SSSE3 pshufb xmm0, xmm7
62cd4a: SSSE3 pshufb xmm1, xmm7
62cd4f: SSSE3 pshufb xmm2, xmm7
62cd54: SSSE3 pshufb xmm3, xmm7
62cd59: SSSE3 pmaddubsw xmm0, xmm6
62cd5e: SSSE3 pmaddubsw xmm1, xmm6
62cd63: SSSE3 pmaddubsw xmm2, xmm6
62cd68: SSSE3 pmaddubsw xmm3, xmm6
62cd6d: SSE2 paddw xmm0, xmm2
62cd71: SSE2 paddw xmm1, xmm3
62cd75: SSE2 pxor xmm2, xmm2
62cd79: SSE2 psrlw xmm0, 0x1
62cd7e: SSE2 psrlw xmm1, 0x1
62cd83: SSE2 pavgw xmm0, xmm2
62cd87: SSE2 pavgw xmm1, xmm2
62cd8b: SSE2 packuswb xmm0, xmm1
62cd8f: SSE2 movdqa xmm2, xmm6
62cd93: SSE2 psllw xmm2, 0xf
62cd98: SSE2 movdqa xmm1, xmm0
62cd9c: SSSE3 pmaddubsw xmm1, xmm5
62cda1: SSSE3 pmaddubsw xmm0, xmm4
62cda6: SSSE3 phaddw xmm0, xmm1
62cdab: SSE2 psubw xmm2, xmm0
62cdaf: SSE2 psrlw xmm2, 0x8
62cdb4: SSE2 packuswb xmm2, xmm2
62cdb8: SSE2 movd dword ptr [esi], xmm2
62cdbc: SSE2 pshufd xmm2, xmm2, 0x55
62cdc1: SSE2 movd dword ptr [esi+edi*1], xmm2
62cdc6: BASE lea edx, ptr [edx+0x20]
62cdc9: BASE lea esi, ptr [esi+0x4]
62cdcc: BASE sub dword ptr [esp+0xc], 0x8
62cdd1: BASE jnle 0x62cd31 <ARGBToUVRow_SSSE3+0x61>
62cdd7: BASE lea esp, ptr [ebp-0x8]
62cdda: BASE pop esi
62cddb: BASE pop edi
62cddc: BASE pop ebp
62cddd: BASE ret
62cdde: BASE int3

BUG=444157316
Change-Id: Iad044f851359f5b052091c7bdab9b96946fc3682
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6987370
Reviewed-by: Justin Green <greenjustin@google.com>
This commit is contained in:
parent
5b22f31cb5
commit
94417b9d21
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: https://chromium.googlesource.com/libyuv/libyuv/
|
||||
Version: 1919
|
||||
Version: 1920
|
||||
License: BSD-3-Clause
|
||||
License File: LICENSE
|
||||
Shipped: yes
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 1919
|
||||
#define LIBYUV_VERSION 1920
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
@ -9,9 +9,6 @@
|
||||
*/
|
||||
|
||||
#include "libyuv/row.h"
|
||||
#if defined(__i386__) && defined(__pic__)
|
||||
#include <string.h>
|
||||
#endif
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
@ -57,9 +54,6 @@ static const uvec16 kAddY16 = {0x7e80u, 0x7e80u, 0x7e80u, 0x7e80u,
|
||||
static const uvec16 kAddY0 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
|
||||
static const uvec16 kAddUV128 = {0x8000u, 0x8000u, 0x8000u, 0x8000u,
|
||||
0x8000u, 0x8000u, 0x8000u, 0x8000u};
|
||||
|
||||
static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
|
||||
0x8080u, 0x8080u, 0x8080u, 0x8080u};
|
||||
|
||||
@ -286,7 +280,7 @@ void RAWToARGBRow_AVX2(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
|
||||
void RAWToRGBARow_SSSE3(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" // 0x000000ff
|
||||
"psrld $0x18,%%xmm5 \n"
|
||||
"psrld $24,%%xmm5 \n"
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -364,8 +358,8 @@ void RGB565ToARGBRow_SSE2(const uint8_t* src, uint8_t* dst, int width) {
|
||||
"pcmpeqb %%xmm3,%%xmm3 \n"
|
||||
"psllw $0xb,%%xmm3 \n"
|
||||
"pcmpeqb %%xmm4,%%xmm4 \n"
|
||||
"psllw $0xa,%%xmm4 \n"
|
||||
"psrlw $0x5,%%xmm4 \n"
|
||||
"psllw $10,%%xmm4 \n"
|
||||
"psrlw $5,%%xmm4 \n"
|
||||
"pcmpeqb %%xmm7,%%xmm7 \n"
|
||||
"psllw $0x8,%%xmm7 \n"
|
||||
"sub %0,%1 \n"
|
||||
@ -1592,15 +1586,20 @@ struct RgbUVConstants {
|
||||
vec8 kRGBToV;
|
||||
};
|
||||
|
||||
// Offsets into RgbUVConstants structure
|
||||
#define KRGBTOU 0
|
||||
#define KRGBTOV 16
|
||||
|
||||
void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
asm volatile(
|
||||
"movdqa %4,%%xmm3 \n"
|
||||
"movdqa %5,%%xmm4 \n"
|
||||
"movdqa %6,%%xmm5 \n"
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" // 0x8000
|
||||
"psllw $15,%%xmm5 \n"
|
||||
"movdqa 0x0(%4),%%xmm3 \n" // kRGBToU
|
||||
"movdqa 0x10(%4),%%xmm4 \n" // kRGBToV
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -1655,9 +1654,7 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
#else
|
||||
"+rm"(width) // %3
|
||||
#endif
|
||||
: "m"(rgbuvconstants->kRGBToU), // %4
|
||||
"m"(rgbuvconstants->kRGBToV), // %5
|
||||
"m"(kAddUV128) // %6
|
||||
: "r"(rgbuvconstants) // %4
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6");
|
||||
}
|
||||
#endif // HAS_ARGBTOUV444ROW_SSSE3
|
||||
@ -1670,10 +1667,11 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %4,%%ymm3 \n"
|
||||
"vbroadcastf128 %5,%%ymm4 \n"
|
||||
"vbroadcastf128 %6,%%ymm5 \n"
|
||||
"vmovdqa %7,%%ymm7 \n"
|
||||
"vbroadcastf128 0x0(%4),%%ymm3 \n" // kRGBToU
|
||||
"vbroadcastf128 0x10(%4),%%ymm4 \n" // kRGBToV
|
||||
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n" // 0x8000
|
||||
"vpsllw $15,%%ymm5,%%ymm5 \n"
|
||||
"vmovdqa %5,%%ymm7 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
@ -1726,10 +1724,8 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
|
||||
#else
|
||||
"+rm"(width) // %3
|
||||
#endif
|
||||
: "m"(rgbuvconstants->kRGBToU), // %4
|
||||
"m"(rgbuvconstants->kRGBToV), // %5
|
||||
"m"(kAddUV128), // %6
|
||||
"m"(kPermdARGBToY_AVX) // %7
|
||||
: "r"(rgbuvconstants), // %4
|
||||
"m"(kPermdARGBToY_AVX) // %5
|
||||
: "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
|
||||
"ymm7");
|
||||
}
|
||||
@ -1751,60 +1747,48 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
#if defined(__i386__) && defined(__pic__)
|
||||
// i386 + PIC builds: Inline asm may run out of general-purpose registers.
|
||||
// In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also
|
||||
// unavailable), so addressing struct fields via memory operands can require
|
||||
// extra temporaries that the compiler cannot allocate given the asm constraints.
|
||||
// To avoid this, copy the RGB-to-UV constants to stack locals first and let the
|
||||
// asm use simple stack-relative addressing.
|
||||
__attribute__((aligned(16))) vec8 local_kRGBToU = {};
|
||||
__attribute__((aligned(16))) vec8 local_kRGBToV = {};
|
||||
memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU));
|
||||
memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV));
|
||||
#endif
|
||||
asm volatile(
|
||||
"movdqa %5,%%xmm4 \n" // RGBToU
|
||||
"movdqa %6,%%xmm5 \n" // RGBToV
|
||||
"pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101
|
||||
"pabsb %%xmm6,%%xmm6 \n"
|
||||
"movdqa %7,%%xmm7 \n" // kShuffleAARRGGBB
|
||||
"sub %1,%2 \n"
|
||||
"movdqa 0x0(%5),%%xmm4 \n" // RGBToU
|
||||
"movdqa 0x10(%5),%%xmm5 \n" // RGBToV
|
||||
"pcmpeqb %%xmm6,%%xmm6 \n" // 0x0101
|
||||
"pabsb %%xmm6,%%xmm6 \n"
|
||||
"movdqa %6,%%xmm7 \n" // kShuffleAARRGGBB
|
||||
"sub %1,%2 \n"
|
||||
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
|
||||
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
||||
"pshufb %%xmm7,%%xmm0 \n" // aarrggbb
|
||||
"pshufb %%xmm7,%%xmm1 \n"
|
||||
"pshufb %%xmm7,%%xmm2 \n"
|
||||
"pshufb %%xmm7,%%xmm3 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2
|
||||
"pmaddubsw %%xmm6,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm3 \n"
|
||||
"paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1
|
||||
"paddw %%xmm3,%%xmm1 \n"
|
||||
"pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw
|
||||
"psrlw $1,%%xmm0 \n"
|
||||
"psrlw $1,%%xmm1 \n"
|
||||
"pavgw %%xmm2,%%xmm0 \n"
|
||||
"pavgw %%xmm2,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n" // mutates
|
||||
"movdqu (%0),%%xmm0 \n" // Read 8x2 ARGB Pixels
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x00(%0,%4,1),%%xmm2 \n"
|
||||
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
||||
"pshufb %%xmm7,%%xmm0 \n" // aarrggbb
|
||||
"pshufb %%xmm7,%%xmm1 \n"
|
||||
"pshufb %%xmm7,%%xmm2 \n"
|
||||
"pshufb %%xmm7,%%xmm3 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm0 \n" // 8x2 -> 4x2
|
||||
"pmaddubsw %%xmm6,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm6,%%xmm3 \n"
|
||||
"paddw %%xmm2,%%xmm0 \n" // 4x2 -> 4x1
|
||||
"paddw %%xmm3,%%xmm1 \n"
|
||||
"pxor %%xmm2,%%xmm2 \n" // 0 for vpavgw
|
||||
"psrlw $1,%%xmm0 \n"
|
||||
"psrlw $1,%%xmm1 \n"
|
||||
"pavgw %%xmm2,%%xmm0 \n"
|
||||
"pavgw %%xmm2,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n" // mutates
|
||||
|
||||
"movdqa %%xmm6,%%xmm2 \n"
|
||||
"psllw $15,%%xmm2 \n" // 0x8000
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm5,%%xmm1 \n" // 4 V
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n" // 4 U
|
||||
"phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv
|
||||
"psubw %%xmm0,%%xmm2 \n"
|
||||
"psrlw $0x8,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
"movd %%xmm2,(%1) \n" // Write 4 U's
|
||||
"pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes
|
||||
"movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's
|
||||
"movdqa %%xmm6,%%xmm2 \n"
|
||||
"psllw $15,%%xmm2 \n" // 0x8000
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm5,%%xmm1 \n" // 4 V
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n" // 4 U
|
||||
"phaddw %%xmm1,%%xmm0 \n" // uuuuvvvv
|
||||
"psubw %%xmm0,%%xmm2 \n"
|
||||
"psrlw $0x8,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
"movd %%xmm2,(%1) \n" // Write 4 U's
|
||||
"pshufd $0x55,%%xmm2,%%xmm2 \n" // Copy V to low 4 bytes
|
||||
"movd %%xmm2,0x00(%1,%2,1) \n" // Write 4 V's
|
||||
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"lea 0x4(%1),%1 \n"
|
||||
@ -1819,16 +1803,10 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
"+rm"(width) // %3
|
||||
#endif
|
||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||
#if defined(__i386__) && defined(__pic__)
|
||||
"m"(local_kRGBToU), // %5
|
||||
"m"(local_kRGBToV), // %6
|
||||
#else // defined(__i386__) && defined(__pic__)
|
||||
"m"(rgbuvconstants->kRGBToU), // %5
|
||||
"m"(rgbuvconstants->kRGBToV), // %6
|
||||
#endif // defined(__i386__) && defined(__pic__)
|
||||
"m"(kShuffleAARRGGBB) // %7
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||
"xmm6", "xmm7");
|
||||
"r"(rgbuvconstants), // %5
|
||||
"m"(kShuffleAARRGGBB) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
|
||||
#endif // HAS_ARGBTOUVROW_SSSE3
|
||||
@ -1838,30 +1816,17 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
|
||||
// 16x2 -> 8x1 ARGB pixels converted to 8 U and 8 V
|
||||
// ARGBToUV does rounding average of 4 ARGB pixels
|
||||
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
#if defined(__i386__) && defined(__pic__)
|
||||
// i386 + PIC builds: Inline asm may run out of general-purpose registers.
|
||||
// In PIC, EBX is reserved for the GOT (and with a frame pointer EBP is also
|
||||
// unavailable), so addressing struct fields via memory operands can require
|
||||
// extra temporaries that the compiler cannot allocate given the asm constraints.
|
||||
// To avoid this, copy the RGB-to-UV constants to stack locals first and let the
|
||||
// asm use simple stack-relative addressing.
|
||||
__attribute__((aligned(32))) vec8 local_kRGBToU = {};
|
||||
__attribute__((aligned(32))) vec8 local_kRGBToV = {};
|
||||
memcpy(&local_kRGBToU, &rgbuvconstants->kRGBToU, sizeof(local_kRGBToU));
|
||||
memcpy(&local_kRGBToV, &rgbuvconstants->kRGBToV, sizeof(local_kRGBToV));
|
||||
#endif
|
||||
|
||||
int src_stride_argb,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width,
|
||||
const struct RgbUVConstants* rgbuvconstants) {
|
||||
asm volatile(
|
||||
"vbroadcastf128 %5,%%ymm4 \n" // RGBToU
|
||||
"vbroadcastf128 %6,%%ymm5 \n" // RGBToV
|
||||
"vbroadcastf128 0(%5),%%ymm4 \n" // RGBToU
|
||||
"vbroadcastf128 0x10(%5),%%ymm5 \n" // RGBToV
|
||||
"vpcmpeqb %%ymm6,%%ymm6,%%ymm6 \n" // 0x0101
|
||||
"vpabsb %%ymm6,%%ymm6 \n"
|
||||
"vmovdqa %7,%%ymm7 \n" // kShuffleAARRGGBB
|
||||
"vmovdqa %6,%%ymm7 \n" // kShuffleAARRGGBB
|
||||
"sub %1,%2 \n"
|
||||
|
||||
"1: \n"
|
||||
@ -1913,16 +1878,10 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
|
||||
"+rm"(width) // %3
|
||||
#endif
|
||||
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||
#if defined(__i386__) && defined(__pic__)
|
||||
"m"(local_kRGBToU), // %5
|
||||
"m"(local_kRGBToV), // %6
|
||||
#else
|
||||
"m"(rgbuvconstants->kRGBToU), // %5
|
||||
"m"(rgbuvconstants->kRGBToV), // %6
|
||||
#endif // defined(__i386__) && defined(__pic__)
|
||||
"m"(kShuffleAARRGGBB) // %7
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
|
||||
"xmm6", "xmm7");
|
||||
"r"(rgbuvconstants), // %5
|
||||
"m"(kShuffleAARRGGBB) // %6
|
||||
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
|
||||
"xmm7");
|
||||
}
|
||||
#endif // HAS_ARGBTOUVROW_AVX2
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user