ARGBToUV: allow 32-bit x86 build

- keep the width loop counter on the stack (memory operand) on i386
- set YMM constants in their own asm block
- add a struct for the shuffle and add constants
- stop clang-format from reflowing a row_neon.cc function

Bug: 413781394
Change-Id: I263f6862cb7589dc31ac65d118f7ebeb65dbb24a
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6495259
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2025-04-28 11:15:30 -07:00 committed by libyuv LUCI CQ
parent 1e40e34573
commit 9f9b5cf660
4 changed files with 129 additions and 110 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv Name: libyuv
URL: https://chromium.googlesource.com/libyuv/libyuv/ URL: https://chromium.googlesource.com/libyuv/libyuv/
Version: 1908 Version: 1909
License: BSD-3-Clause License: BSD-3-Clause
License File: LICENSE License File: LICENSE
Shipped: yes Shipped: yes

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1908 #define LIBYUV_VERSION 1909
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -1642,12 +1642,16 @@ void ARGBToUV444MatrixRow_SSSE3(const uint8_t* src_argb,
"lea 0x40(%0),%0 \n" "lea 0x40(%0),%0 \n"
"lea 0x10(%1),%1 \n" "lea 0x10(%1),%1 \n"
"sub $0x10,%3 \n" "subl $0x10,%3 \n"
"jg 1b \n" "jg 1b \n"
: "+r"(src_argb), // %0 : "+r"(src_argb), // %0
"+r"(dst_u), // %1 "+r"(dst_u), // %1
"+r"(dst_v), // %2 "+r"(dst_v), // %2
#if defined(__i386__)
"+m"(width) // %3
#else
"+rm"(width) // %3 "+rm"(width) // %3
#endif
: "m"(rgbuvconstants->kRGBToU), // %4 : "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5 "m"(rgbuvconstants->kRGBToV), // %5
"m"(kAddUV128) // %6 "m"(kAddUV128) // %6
@ -1708,74 +1712,6 @@ void ARGBToUV444MatrixRow_AVX2(const uint8_t* src_argb,
"vmovdqu %%ymm0,(%1,%2,1) \n" "vmovdqu %%ymm0,(%1,%2,1) \n"
"lea 0x80(%0),%0 \n" "lea 0x80(%0),%0 \n"
"lea 0x20(%1),%1 \n" "lea 0x20(%1),%1 \n"
"sub $0x20,%3 \n"
"jg 1b \n"
"vzeroupper \n"
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+rm"(width) // %3
: "m"(rgbuvconstants->kRGBToU), // %4
"m"(rgbuvconstants->kRGBToV), // %5
"m"(kAddUV128), // %6
"m"(kPermdARGBToY_AVX) // %7
: "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
"ymm7");
}
#endif // HAS_ARGBTOUV444ROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
// Byte-shuffle control consumed by the "vpshufb %8" step of
// ARGBToUVMatrixRow_AVX2. Both 16-byte halves carry the same pattern, which
// interleaves the 2-byte pairs taken from the low 8 bytes and the high 8 bytes
// of each 128-bit lane (bytes 0,1 then 8,9 then 2,3 then 10,11, ...).
static const lvec8 kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
int src_stride_argb,
uint8_t* dst_u,
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vbroadcastf128 %6,%%ymm6 \n"
"vbroadcastf128 %7,%%ymm7 \n"
"sub %1,%2 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
"vmovdqu 0x40(%0),%%ymm2 \n"
"vmovdqu 0x60(%0),%%ymm3 \n"
"vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
"vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
"vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
"vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
"lea 0x80(%0),%0 \n"
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n"
"vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n"
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
"vpsubw %%ymm0,%%ymm5,%%ymm0 \n"
"vpsubw %%ymm1,%%ymm5,%%ymm1 \n"
"vpsrlw $0x8,%%ymm1,%%ymm1 \n"
"vpsrlw $0x8,%%ymm0,%%ymm0 \n"
"vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
"vpshufb %8,%%ymm0,%%ymm0 \n"
"vextractf128 $0x0,%%ymm0,(%1) \n"
"vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"
"lea 0x10(%1),%1 \n"
"subl $0x20,%3 \n" "subl $0x20,%3 \n"
"jg 1b \n" "jg 1b \n"
"vzeroupper \n" "vzeroupper \n"
@ -1787,28 +1723,31 @@ void ARGBToUVMatrixRow_AVX2(const uint8_t* src_argb,
#else #else
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "r"((intptr_t)(src_stride_argb)), // %4 : "m"(rgbuvconstants->kRGBToU), // %4
"m"(kAddUV128), // %5 "m"(rgbuvconstants->kRGBToV), // %5
"m"(rgbuvconstants->kRGBToU), // %6 "m"(kAddUV128), // %6
"m"(rgbuvconstants->kRGBToV), // %7 "m"(kPermdARGBToY_AVX) // %7
"m"(kShufARGBToUV_AVX) // %8 : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "ymm7");
"xmm7");
} }
#endif // HAS_ARGBTOUV444ROW_AVX2
#ifdef HAS_ARGBTOUVROW_SSSE3 #ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb, void OMITFP ARGBToUVMatrixRow_SSSE3(
int src_stride_argb, const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
uint8_t* dst_u, uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
uint8_t* dst_v,
int width,
const struct RgbUVConstants* rgbuvconstants) {
asm volatile( asm volatile(
"movdqa %5,%%xmm3 \n" "movdqa %0,%%xmm3 \n"
"movdqa %6,%%xmm4 \n" "movdqa %1,%%xmm4 \n"
"movdqa %7,%%xmm5 \n" "movdqa %2,%%xmm5 \n"
"sub %1,%2 \n" :
: "m"(rgbuvconstants->kRGBToU), // %0
"m"(rgbuvconstants->kRGBToV), // %1
"m"(kAddUV128) // %2
: "xmm3", "xmm4", "xmm5");
asm volatile("sub %1,%2 \n"
LABELALIGN LABELALIGN
"1: \n" "1: \n"
@ -1862,16 +1801,93 @@ void ARGBToUVMatrixRow_SSSE3(const uint8_t* src_argb,
#else #else
"+rm"(width) // %3 "+rm"(width) // %3
#endif #endif
: "r"((intptr_t)(src_stride_argb)), // %4 : "r"((intptr_t)(src_stride_argb)) // %4
"m"(rgbuvconstants->kRGBToU), // %5 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
"m"(rgbuvconstants->kRGBToV), // %6 "xmm6", "xmm7");
"m"(kAddUV128) // %7
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
} }
#endif // HAS_ARGBTOUVROW_SSSE3 #endif // HAS_ARGBTOUVROW_SSSE3
// vpshufb for vphaddw + vpackuswb packed to shorts.
// Coefficients expressed as negatives to allow 128
//
// Constants for ARGBToUVMatrixRow_AVX2, grouped so the asm can reach both
// through a single base register. Member order is load-bearing: the asm reads
// kShufARGBToUV at offset 0 ("vpshufb (%5)") and kAddUV128 at offset 32
// ("vmovdqa 32(%5)").
// NOTE(review): offset 32 assumes lvec8 occupies 32 bytes with no padding, and
// the aligned vmovdqa load assumes the struct is 32-byte aligned - confirm
// against the lvec8/ulvec8 typedefs.
struct UVMatrixConstants {
lvec8 kShufARGBToUV;  // shuffle control for the final vpshufb
ulvec8 kAddUV128;  // value the U/V sums are subtracted from (vpsubw)
};
static const UVMatrixConstants kShufARGBToUV_AVX = {
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128,
0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128, 0, 128};
// Computes one row of U and one row of V from two rows of ARGB using AVX2.
//
// src_argb         first source row; the second row is read at
//                  src_argb + src_stride_argb.
// src_stride_argb  byte offset from the first source row to the second.
// dst_u, dst_v     destination rows; 16 bytes are stored to each per loop pass.
// width            pixel count; reduced by 32 per pass, loop runs while > 0.
// rgbuvconstants   supplies the kRGBToU / kRGBToV coefficient vectors.
//
// Each pass reads 128 bytes from each source row (32 pixels at 4 bytes each),
// averages the two rows, averages horizontally adjacent pixels, and produces
// 16 U and 16 V bytes. The body always executes at least once and only works
// in whole 32-pixel groups, so callers must size buffers accordingly.
//
// On i386 the width counter is a memory operand ("+m") so it does not consume
// a general-purpose register.
// NOTE(review): OMITFP is defined elsewhere; presumably it frees the frame
// pointer register for the asm operands on 32-bit builds - confirm.
void OMITFP ARGBToUVMatrixRow_AVX2(
    const uint8_t* src_argb, int src_stride_argb, uint8_t* dst_u,
    uint8_t* dst_v, int width, const struct RgbUVConstants* rgbuvconstants) {
  // Load the coefficient vectors in a separate asm statement so the main block
  // needs fewer operands. ymm6/ymm7 are listed as clobbers so the compiler
  // knows this statement overwrites them.
  // NOTE(review): the main block below relies on ymm6/ymm7 surviving between
  // the two asm statements; the compiler is not told about that dependency.
  asm volatile(
      "vbroadcastf128 %0,%%ymm6 \n"  // ymm6 = kRGBToU in both lanes
      "vbroadcastf128 %1,%%ymm7 \n"  // ymm7 = kRGBToV in both lanes
      :
      : "m"(rgbuvconstants->kRGBToU),  // %0
        "m"(rgbuvconstants->kRGBToV)   // %1
      : "ymm6", "ymm7");

  asm volatile(
      "vmovdqa 32(%5),%%ymm5 \n"  // ymm5 = kAddUV128 (second struct member)
      "sub %1,%2 \n"              // %2 = dst_v - dst_u, so (%1,%2,1) is dst_v

      LABELALIGN
      "1: \n"
      // Load 32 pixels from the first row and average with the second row.
      "vmovdqu (%0),%%ymm0 \n"
      "vmovdqu 0x20(%0),%%ymm1 \n"
      "vmovdqu 0x40(%0),%%ymm2 \n"
      "vmovdqu 0x60(%0),%%ymm3 \n"
      "vpavgb 0x00(%0,%4,1),%%ymm0,%%ymm0 \n"
      "vpavgb 0x20(%0,%4,1),%%ymm1,%%ymm1 \n"
      "vpavgb 0x40(%0,%4,1),%%ymm2,%%ymm2 \n"
      "vpavgb 0x60(%0,%4,1),%%ymm3,%%ymm3 \n"
      "lea 0x80(%0),%0 \n"
      // Split even/odd 32-bit pixels and average them horizontally.
      "vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
      "vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
      "vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
      "vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
      "vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
      "vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
      // Multiply-accumulate with the U (ymm6) and V (ymm7) coefficients.
      "vpmaddubsw %%ymm6,%%ymm0,%%ymm1 \n"
      "vpmaddubsw %%ymm6,%%ymm2,%%ymm3 \n"
      "vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n"
      "vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n"
      "vphaddw %%ymm3,%%ymm1,%%ymm1 \n"  // ymm1 = U sums
      "vphaddw %%ymm2,%%ymm0,%%ymm0 \n"  // ymm0 = V sums
      // result = kAddUV128 - sum, then keep the high byte of each word.
      "vpsubw %%ymm0,%%ymm5,%%ymm0 \n"
      "vpsubw %%ymm1,%%ymm5,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm1,%%ymm1 \n"
      "vpsrlw $0x8,%%ymm0,%%ymm0 \n"
      // Pack to bytes and reorder so the low lane is U and the high lane is V.
      "vpackuswb %%ymm0,%%ymm1,%%ymm0 \n"
      "vpermq $0xd8,%%ymm0,%%ymm0 \n"
      "vpshufb (%5),%%ymm0,%%ymm0 \n"  // kShufARGBToUV (first struct member)
      "vextractf128 $0x0,%%ymm0,(%1) \n"          // 16 bytes to dst_u
      "vextractf128 $0x1,%%ymm0,0x0(%1,%2,1) \n"  // 16 bytes to dst_v
      "lea 0x10(%1),%1 \n"
      "subl $0x20,%3 \n"  // 32 source pixels consumed
      "jg 1b \n"
      "vzeroupper \n"
      : "+r"(src_argb),  // %0
        "+r"(dst_u),     // %1
        "+r"(dst_v),     // %2
#if defined(__i386__)
        "+m"(width)  // %3
#else
        "+rm"(width)  // %3
#endif
      : "r"((intptr_t)(src_stride_argb)),  // %4
        "r"(&kShufARGBToUV_AVX)            // %5
      : "memory", "cc", "ymm0", "ymm1", "ymm2", "ymm3", "ymm4", "ymm5", "ymm6",
        "ymm7");
}
#ifdef HAS_ARGBTOUV444ROW_SSSE3 #ifdef HAS_ARGBTOUV444ROW_SSSE3
// RGB to BT601 coefficients // RGB to BT601 coefficients

View File

@ -267,9 +267,12 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
YUVTORGB_SETUP YUVTORGB_SETUP
"vmov.u8 d6, #255 \n" "vmov.u8 d6, #255 \n"
"1: \n" // "1: \n" //
READYUV422 READYUV422 //
"subs %[width], %[width], #8 \n" YUVTORGB RGBTORGB8 "subs %[width], %[width], #8 \n" //
STORERGBA "bgt 1b \n" YUVTORGB //
RGBTORGB8 //
STORERGBA //
"bgt 1b \n"
: [src_y] "+r"(src_y), // %[src_y] : [src_y] "+r"(src_y), // %[src_y]
[src_u] "+r"(src_u), // %[src_u] [src_u] "+r"(src_u), // %[src_u]
[src_v] "+r"(src_v), // %[src_v] [src_v] "+r"(src_v), // %[src_v]