mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-02-16 23:29:52 +08:00
Port ARGBToJ420 to AVX2
ARGBToJ420 had an SSSE3 version but no AVX2 version. ARGBToI420 already had an AVX2 version, so adapt that code for J420. TBR=harryjin@google.com BUG=libyuv:553 Review URL: https://codereview.chromium.org/1702373004 .
This commit is contained in:
parent
127ff512b3
commit
22e062a448
@ -1023,6 +1023,67 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOUVROW_AVX2
|
#endif // HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBTOUVJROW_AVX2
|
||||||
|
// AVX2 row function: reads two rows of ARGB (src_argb0 and the row
// src_stride_argb bytes after it) and writes one row of U to dst_u and one
// row of V to dst_v, using the "J" constants kARGBToUJ, kARGBToVJ and
// kAddUVJ128 (presumably the JPEG/full-range coefficients; their definitions
// are outside this view - confirm).
//
// Each loop iteration consumes 128 bytes from each source row (32 pixels,
// assuming 4 bytes per ARGB pixel), stores 16 bytes to dst_u and 16 bytes to
// dst_v, and subtracts 32 from width. The loop body runs before width is
// tested, so it always executes at least once and always processes a full
// 32-pixel group.
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
||||||
|
uint8* dst_u, uint8* dst_v, int width) {
|
||||||
|
asm volatile (
|
||||||
|
// Broadcast the 128-bit constants into both halves of ymm5/ymm6/ymm7:
// ymm5 = kAddUVJ128 (bias), ymm6 = kARGBToVJ, ymm7 = kARGBToUJ.
"vbroadcastf128 %5,%%ymm5 \n"
|
||||||
|
"vbroadcastf128 %6,%%ymm6 \n"
|
||||||
|
"vbroadcastf128 %7,%%ymm7 \n"
|
||||||
|
// Turn dst_v into an offset from dst_u so only dst_u needs advancing.
"sub %1,%2 \n"
|
||||||
|
LABELALIGN
|
||||||
|
"1: \n"
|
||||||
|
// Load 128 bytes of the first source row.
"vmovdqu " MEMACCESS(0) ",%%ymm0 \n"
|
||||||
|
"vmovdqu " MEMACCESS2(0x20,0) ",%%ymm1 \n"
|
||||||
|
"vmovdqu " MEMACCESS2(0x40,0) ",%%ymm2 \n"
|
||||||
|
"vmovdqu " MEMACCESS2(0x60,0) ",%%ymm3 \n"
|
||||||
|
// Byte-wise average with the row at src + stride (vertical subsample).
VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0) // vpavgb (%0,%4,1),%%ymm0,%%ymm0
|
||||||
|
VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
|
||||||
|
VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
|
||||||
|
VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
|
||||||
|
// Advance the source pointer by 128 bytes.
"lea " MEMLEA(0x80,0) ",%0 \n"
|
||||||
|
// Horizontal subsample: 0x88 gathers the even-indexed dwords and 0xdd the
// odd-indexed dwords of each register pair; vpavgb averages the two sets.
"vshufps $0x88,%%ymm1,%%ymm0,%%ymm4 \n"
|
||||||
|
"vshufps $0xdd,%%ymm1,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpavgb %%ymm4,%%ymm0,%%ymm0 \n"
|
||||||
|
"vshufps $0x88,%%ymm3,%%ymm2,%%ymm4 \n"
|
||||||
|
"vshufps $0xdd,%%ymm3,%%ymm2,%%ymm2 \n"
|
||||||
|
"vpavgb %%ymm4,%%ymm2,%%ymm2 \n"
|
|
||||||
|
// Multiply-accumulate by the U coefficients (ymm7) into ymm1/ymm3 and by
// the V coefficients (ymm6) into ymm0/ymm2.
"vpmaddubsw %%ymm7,%%ymm0,%%ymm1 \n"
|
||||||
|
"vpmaddubsw %%ymm7,%%ymm2,%%ymm3 \n"
|
||||||
|
"vpmaddubsw %%ymm6,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpmaddubsw %%ymm6,%%ymm2,%%ymm2 \n"
|
||||||
|
// Finish the per-pixel sums: ymm1 = U words, ymm0 = V words.
"vphaddw %%ymm3,%%ymm1,%%ymm1 \n"
|
||||||
|
"vphaddw %%ymm2,%%ymm0,%%ymm0 \n"
|
||||||
|
// Add the kAddUVJ128 bias, then arithmetic shift right by 8.
"vpaddw %%ymm5,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpaddw %%ymm5,%%ymm1,%%ymm1 \n"
|
||||||
|
"vpsraw $0x8,%%ymm1,%%ymm1 \n"
|
||||||
|
"vpsraw $0x8,%%ymm0,%%ymm0 \n"
|
||||||
|
// Pack U (ymm1) and V (ymm0) words to bytes, then undo the per-lane
// interleaving left by vpacksswb (vpermq) and by vshufps/vphaddw (vpshufb
// with the kShufARGBToUV_AVX table).
"vpacksswb %%ymm0,%%ymm1,%%ymm0 \n"
|
||||||
|
"vpermq $0xd8,%%ymm0,%%ymm0 \n"
|
||||||
|
"vpshufb %8,%%ymm0,%%ymm0 \n"
|
|
||||||
|
// Store the low 128 bits to dst_u and the high 128 bits to dst_u + (dst_v - dst_u).
"vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
|
||||||
|
VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
|
||||||
|
// Advance dst_u by 16 bytes; loop while more than 0 pixels remain after
// subtracting 32.
"lea " MEMLEA(0x10,1) ",%1 \n"
|
||||||
|
"sub $0x20,%3 \n"
|
||||||
|
"jg 1b \n"
|
||||||
|
// Clear the upper ymm halves before returning to non-VEX code.
"vzeroupper \n"
|
||||||
|
: "+r"(src_argb0), // %0
|
||||||
|
"+r"(dst_u), // %1
|
||||||
|
"+r"(dst_v), // %2
|
||||||
|
"+rm"(width) // %3
|
||||||
|
: "r"((intptr_t)(src_stride_argb)), // %4
|
||||||
|
"m"(kAddUVJ128), // %5
|
||||||
|
"m"(kARGBToVJ), // %6
|
||||||
|
"m"(kARGBToUJ), // %7
|
||||||
|
"m"(kShufARGBToUV_AVX) // %8
|
||||||
|
: "memory", "cc", NACL_R14
|
||||||
|
"xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBTOUVJROW_AVX2
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUVJROW_SSSE3
|
#ifdef HAS_ARGBTOUVJROW_SSSE3
|
||||||
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||||
uint8* dst_u, uint8* dst_v, int width) {
|
uint8* dst_u, uint8* dst_v, int width) {
|
||||||
@ -1475,7 +1536,7 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
|
|||||||
#define READYUV411_TEMP \
|
#define READYUV411_TEMP \
|
||||||
"movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
|
"movzwl " MEMACCESS([u_buf]) ",%[temp] \n" \
|
||||||
"movd %[temp],%%xmm0 \n" \
|
"movd %[temp],%%xmm0 \n" \
|
||||||
MEMOPARG(movzwl,0x00,[u_buf],[v_buf],1,[temp]) " \n" \
|
MEMOPARG(movzwl, 0x00, [u_buf], [v_buf], 1, [temp]) " \n" \
|
||||||
"movd %[temp],%%xmm1 \n" \
|
"movd %[temp],%%xmm1 \n" \
|
||||||
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
|
"lea " MEMLEA(0x2, [u_buf]) ",%[u_buf] \n" \
|
||||||
"punpcklbw %%xmm1,%%xmm0 \n" \
|
"punpcklbw %%xmm1,%%xmm0 \n" \
|
||||||
@ -2032,7 +2093,7 @@ void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
|
|||||||
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
|
"vpackuswb %%ymm2,%%ymm2,%%ymm2 \n"
|
||||||
#define YUVTORGB_REGS_AVX2 \
|
#define YUVTORGB_REGS_AVX2 \
|
||||||
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
|
"xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
|
||||||
#else// Convert 16 pixels: 16 UV and 16 Y.
|
#else // Convert 16 pixels: 16 UV and 16 Y.
|
||||||
#define YUVTORGB_SETUP_AVX2(yuvconstants)
|
#define YUVTORGB_SETUP_AVX2(yuvconstants)
|
||||||
#define YUVTORGB_AVX2(yuvconstants) \
|
#define YUVTORGB_AVX2(yuvconstants) \
|
||||||
"vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
|
"vpmaddubsw " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2 \n" \
|
||||||
|
|||||||
@ -1505,7 +1505,7 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
|||||||
pmaddubsw xmm3, xmm6
|
pmaddubsw xmm3, xmm6
|
||||||
phaddw xmm0, xmm2
|
phaddw xmm0, xmm2
|
||||||
phaddw xmm1, xmm3
|
phaddw xmm1, xmm3
|
||||||
paddw xmm0, xmm5 // +.5 rounding -> unsigned
|
paddw xmm0, xmm5 // +.5 rounding -> unsigned
|
||||||
paddw xmm1, xmm5
|
paddw xmm1, xmm5
|
||||||
psraw xmm0, 8
|
psraw xmm0, 8
|
||||||
psraw xmm1, 8
|
psraw xmm1, 8
|
||||||
@ -1590,6 +1590,73 @@ void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
|||||||
}
|
}
|
||||||
#endif // HAS_ARGBTOUVROW_AVX2
|
#endif // HAS_ARGBTOUVROW_AVX2
|
||||||
|
|
||||||
|
#ifdef HAS_ARGBTOUVJROW_AVX2
|
||||||
|
__declspec(naked)
|
||||||
|
void ARGBToUVJRow_AVX2(const uint8* src_argb0, int src_stride_argb,
|
||||||
|
uint8* dst_u, uint8* dst_v, int width) {
|
||||||
|
__asm {
|
||||||
|
push esi
|
||||||
|
push edi
|
||||||
|
mov eax, [esp + 8 + 4] // src_argb
|
||||||
|
mov esi, [esp + 8 + 8] // src_stride_argb
|
||||||
|
mov edx, [esp + 8 + 12] // dst_u
|
||||||
|
mov edi, [esp + 8 + 16] // dst_v
|
||||||
|
mov ecx, [esp + 8 + 20] // width
|
||||||
|
vbroadcastf128 ymm5, xmmword ptr kAddUV128
|
||||||
|
vbroadcastf128 ymm6, xmmword ptr kARGBToV
|
||||||
|
vbroadcastf128 ymm7, xmmword ptr kARGBToU
|
||||||
|
sub edi, edx // stride from u to v
|
||||||
|
|
||||||
|
convertloop:
|
||||||
|
/* step 1 - subsample 32x2 argb pixels to 16x1 */
|
||||||
|
vmovdqu ymm0, [eax]
|
||||||
|
vmovdqu ymm1, [eax + 32]
|
||||||
|
vmovdqu ymm2, [eax + 64]
|
||||||
|
vmovdqu ymm3, [eax + 96]
|
||||||
|
vpavgb ymm0, ymm0, [eax + esi]
|
||||||
|
vpavgb ymm1, ymm1, [eax + esi + 32]
|
||||||
|
vpavgb ymm2, ymm2, [eax + esi + 64]
|
||||||
|
vpavgb ymm3, ymm3, [eax + esi + 96]
|
||||||
|
lea eax, [eax + 128]
|
||||||
|
vshufps ymm4, ymm0, ymm1, 0x88
|
||||||
|
vshufps ymm0, ymm0, ymm1, 0xdd
|
||||||
|
vpavgb ymm0, ymm0, ymm4 // mutated by vshufps
|
||||||
|
vshufps ymm4, ymm2, ymm3, 0x88
|
||||||
|
vshufps ymm2, ymm2, ymm3, 0xdd
|
||||||
|
vpavgb ymm2, ymm2, ymm4 // mutated by vshufps
|
||||||
|
|
||||||
|
// step 2 - convert to U and V
|
||||||
|
// from here down is very similar to Y code except
|
||||||
|
// instead of 32 different pixels, its 16 pixels of U and 16 of V
|
||||||
|
vpmaddubsw ymm1, ymm0, ymm7 // U
|
||||||
|
vpmaddubsw ymm3, ymm2, ymm7
|
||||||
|
vpmaddubsw ymm0, ymm0, ymm6 // V
|
||||||
|
vpmaddubsw ymm2, ymm2, ymm6
|
||||||
|
vphaddw ymm1, ymm1, ymm3 // mutates
|
||||||
|
vphaddw ymm0, ymm0, ymm2
|
||||||
|
vpaddw ymm1, ymm1, ymm5 // +.5 rounding -> unsigned
|
||||||
|
vpaddw ymm0, ymm0, ymm5
|
||||||
|
vpsraw ymm1, ymm1, 8
|
||||||
|
vpsraw ymm0, ymm0, 8
|
||||||
|
vpacksswb ymm0, ymm1, ymm0 // mutates
|
||||||
|
vpermq ymm0, ymm0, 0xd8 // For vpacksswb
|
||||||
|
vpshufb ymm0, ymm0, ymmword ptr kShufARGBToUV_AVX // for vshufps/vphaddw
|
||||||
|
|
||||||
|
// step 3 - store 16 U and 16 V values
|
||||||
|
vextractf128 [edx], ymm0, 0 // U
|
||||||
|
vextractf128 [edx + edi], ymm0, 1 // V
|
||||||
|
lea edx, [edx + 16]
|
||||||
|
sub ecx, 32
|
||||||
|
jg convertloop
|
||||||
|
|
||||||
|
pop edi
|
||||||
|
pop esi
|
||||||
|
vzeroupper
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif // HAS_ARGBTOUVJROW_AVX2
|
||||||
|
|
||||||
__declspec(naked)
|
__declspec(naked)
|
||||||
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
|
||||||
uint8* dst_u, uint8* dst_v, int width) {
|
uint8* dst_u, uint8* dst_v, int width) {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user