AVX2 version of ARGBToI420
BUG=181
TEST=unittest
Review URL: https://webrtc-codereview.appspot.com/1090005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@566 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent 8e26eada77
commit 551d2b297e
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 565
+Version: 566
 License: BSD
 License File: LICENSE
 
include/libyuv/row.h
@@ -123,6 +123,12 @@ extern "C" {
 // TODO(fbarchard): Port to gcc.
 #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBCOLORTABLEROW_X86
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// TODO(fbarchard): Hook these up to all functions. e.g. format conversion.
+#define HAS_ARGBTOYROW_AVX2
+#define HAS_ARGBTOUVROW_AVX2
+#endif
 #endif
 
 // The following are Yasm x86 only.
@@ -258,6 +264,13 @@ typedef __declspec(align(16)) int16 vec16[8];
 typedef __declspec(align(16)) uint16 uvec16[8];
 typedef __declspec(align(16)) int32 vec32[4];
 typedef __declspec(align(16)) uint32 uvec32[4];
+typedef __declspec(align(32)) int8 lvec8[32];
+typedef __declspec(align(32)) uint8 ulvec8[32];
+typedef __declspec(align(32)) int16 lvec16[16];
+typedef __declspec(align(32)) uint16 ulvec16[16];
+typedef __declspec(align(32)) int32 lvec32[8];
+typedef __declspec(align(32)) uint32 ulvec32[8];
+
 #elif defined(__GNUC__)
 #define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
 typedef int8 __attribute__((vector_size(16))) vec8;
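Note: the new 32-byte `lvec`/`ulvec` typedefs exist because aligned 256-bit loads (`vmovdqa ymm, m256`) fault on anything less than 32-byte alignment. A minimal portable sketch of the same idea, using standard C++ `alignas` instead of the MSVC-only `__declspec(align())` (the names here are illustrative, not from libyuv):

#include <cstdint>

// Hypothetical portable analogue of libyuv's ulvec8: a 32-byte-aligned
// 32-byte array, safe to load with an aligned 256-bit move (vmovdqa).
struct alignas(32) UVec8x32 {
  uint8_t v[32];
};

static const UVec8x32 kAddY16Example = {{
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16,
}};

static_assert(alignof(UVec8x32) == 32, "constant must be 32-byte aligned");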
@@ -360,6 +373,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
                         uint8* dst_argb,
                         int width);
 
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
+void ARGBToYRow_Any_AVX2(const uint8* src_argb, uint8* dst_y, int pix);
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
 void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix);
 void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix);
@@ -430,6 +446,12 @@ void RGB565ToYRow_Any_NEON(const uint8* src_rgb565, uint8* dst_y, int pix);
 void ARGB1555ToYRow_Any_NEON(const uint8* src_argb1555, uint8* dst_y, int pix);
 void ARGB4444ToYRow_Any_NEON(const uint8* src_argb4444, uint8* dst_y, int pix);
 
+void ARGBToUVRow_AVX2(const uint8* src_argb, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb,
+                                uint8* dst_u, uint8* dst_v, int width);
+void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
+                          uint8* dst_u, uint8* dst_v, int width);
 void ARGBToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width);
 void BGRAToUVRow_SSSE3(const uint8* src_bgra, int src_stride_bgra,
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 565
+#define LIBYUV_VERSION 566
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
source/convert.cc
@@ -739,7 +739,26 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
       }
     }
   }
-#elif defined(HAS_ARGBTOYROW_NEON)
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+  bool clear = false;
+  if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+    clear = true;
+    ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+    ARGBToYRow = ARGBToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      ARGBToUVRow = ARGBToUVRow_Unaligned_AVX2;
+      ARGBToYRow = ARGBToYRow_Unaligned_AVX2;
+      if (IS_ALIGNED(src_argb, 32) && IS_ALIGNED(src_stride_argb, 32)) {
+        ARGBToUVRow = ARGBToUVRow_AVX2;
+        if (IS_ALIGNED(dst_y, 32) && IS_ALIGNED(dst_stride_y, 32)) {
+          ARGBToYRow = ARGBToYRow_AVX2;
+        }
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
     ARGBToYRow = ARGBToYRow_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
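Note: this is libyuv's usual three-tier dispatch: `_Any_` handles any width once AVX2 is detected, `_Unaligned_` requires the width to be a multiple of 32 pixels, and the plain `_AVX2` kernels additionally require 32-byte-aligned pointers and strides so the aligned `vmovdqa` forms can be used. A stand-alone, stripped-down sketch of the selection for the Y row (the stub kernels and `PickYRow` helper are illustrative, not part of the commit):

#include <stdint.h>

typedef void (*YRowFn)(const uint8_t* src_argb, uint8_t* dst_y, int pix);

// Illustrative stand-ins for the real kernels declared in row.h.
static void YRowC(const uint8_t* s, uint8_t* d, int n) { (void)s; (void)d; (void)n; }
static void YRowAnyAVX2(const uint8_t* s, uint8_t* d, int n) { (void)s; (void)d; (void)n; }
static void YRowUnalignedAVX2(const uint8_t* s, uint8_t* d, int n) { (void)s; (void)d; (void)n; }
static void YRowAVX2(const uint8_t* s, uint8_t* d, int n) { (void)s; (void)d; (void)n; }

#define IS_ALIGNED_SKETCH(p, a) ((((uintptr_t)(p)) & ((a) - 1)) == 0)

// Mirror of the selection ARGBToI420 performs for the Y row.
static YRowFn PickYRow(const uint8_t* src, int src_stride,
                       const uint8_t* dst, int dst_stride,
                       int width, bool has_avx2) {
  YRowFn fn = YRowC;  // always-correct scalar fallback
  if (has_avx2 && width >= 32) {
    fn = YRowAnyAVX2;                       // handles any width >= 32
    if (IS_ALIGNED_SKETCH(width, 32)) {
      fn = YRowUnalignedAVX2;               // whole 32-pixel groups
      if (IS_ALIGNED_SKETCH(src, 32) && IS_ALIGNED_SKETCH(src_stride, 32) &&
          IS_ALIGNED_SKETCH(dst, 32) && IS_ALIGNED_SKETCH(dst_stride, 32)) {
        fn = YRowAVX2;                      // aligned vmovdqa fast path
      }
    }
  }
  return fn;
}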
@@ -767,6 +786,12 @@ int ARGBToI420(const uint8* src_argb, int src_stride_argb,
     ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
     ARGBToYRow(src_argb, dst_y, width);
   }
+#if defined(HAS_ARGBTOYROW_AVX2)
+  if (clear) {
+    __asm vzeroupper;
+  }
+#endif
   return 0;
 }
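Note: the `clear` flag defers `vzeroupper` to a single point after the row loop. Executing 256-bit AVX instructions leaves the upper ymm halves dirty, and later SSE instructions then pay a state-transition penalty, so the uppers are zeroed once before returning. The intrinsics equivalent of that cleanup, as a sketch rather than the committed code:

#include <immintrin.h>

// Sketch: after a batch of 256-bit AVX2 work, zero the upper ymm halves
// before returning to (possibly SSE-compiled) callers.
void ScaleBy2InPlace(short* data /* 16 elements */) {
  __m256i v = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(data));
  v = _mm256_add_epi16(v, v);  // arbitrary 256-bit work
  _mm256_storeu_si256(reinterpret_cast<__m256i*>(data), v);
  _mm256_zeroupper();  // equivalent of the __asm vzeroupper above
}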
source/row_any.cc
@@ -195,6 +195,9 @@ BAYERANY(ARGBToBayerRow_Any_NEON, ARGBToBayerRow_NEON, ARGBToBayerRow_C,
                dst_y + (width - NUM) * BPP, NUM); \
     }
 
+#ifdef HAS_ARGBTOYROW_AVX2
+YANY(ARGBToYRow_Any_AVX2, ARGBToYRow_Unaligned_AVX2, 4, 1, 32)
+#endif
 #ifdef HAS_ARGBTOYROW_SSSE3
 YANY(ARGBToYRow_Any_SSSE3, ARGBToYRow_Unaligned_SSSE3, 4, 1, 16)
 YANY(BGRAToYRow_Any_SSSE3, BGRAToYRow_Unaligned_SSSE3, 4, 1, 16)
@@ -251,37 +254,40 @@ YANY(ARGBAttenuateRow_Any_NEON, ARGBAttenuateRow_NEON, ARGBAttenuateRow_C,
 #endif
 
 // RGB/YUV to UV does multiple of 16 with SIMD and remainder with C.
-#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP) \
+#define UVANY(NAMEANY, ANYTOUV_SIMD, ANYTOUV_C, BPP, MASK) \
     void NAMEANY(const uint8* src_argb, int src_stride_argb, \
                  uint8* dst_u, uint8* dst_v, int width) { \
-      int n = width & ~15; \
+      int n = width & ~MASK; \
       ANYTOUV_SIMD(src_argb, src_stride_argb, dst_u, dst_v, n); \
       ANYTOUV_C(src_argb + n * BPP, src_stride_argb, \
                 dst_u + (n >> 1), \
                 dst_v + (n >> 1), \
-                width & 15); \
+                width & MASK); \
     }
 
+#ifdef HAS_ARGBTOYROW_AVX2
+UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)
+#endif
 #ifdef HAS_ARGBTOUVROW_SSSE3
-UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4)
-UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4)
-UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2)
+UVANY(ARGBToUVRow_Any_SSSE3, ARGBToUVRow_Unaligned_SSSE3, ARGBToUVRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_SSSE3, BGRAToUVRow_Unaligned_SSSE3, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_SSSE3, ABGRToUVRow_Unaligned_SSSE3, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_SSSE3, RGBAToUVRow_Unaligned_SSSE3, RGBAToUVRow_C, 4, 15)
+UVANY(YUY2ToUVRow_Any_SSE2, YUY2ToUVRow_Unaligned_SSE2, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_SSE2, UYVYToUVRow_Unaligned_SSE2, UYVYToUVRow_C, 2, 15)
 #endif
 #ifdef HAS_ARGBTOUVROW_NEON
-UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4)
-UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4)
-UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4)
-UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4)
-UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3)
-UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3)
-UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2)
-UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2)
-UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2)
-UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2)
-UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2)
+UVANY(ARGBToUVRow_Any_NEON, ARGBToUVRow_NEON, ARGBToUVRow_C, 4, 15)
+UVANY(BGRAToUVRow_Any_NEON, BGRAToUVRow_NEON, BGRAToUVRow_C, 4, 15)
+UVANY(ABGRToUVRow_Any_NEON, ABGRToUVRow_NEON, ABGRToUVRow_C, 4, 15)
+UVANY(RGBAToUVRow_Any_NEON, RGBAToUVRow_NEON, RGBAToUVRow_C, 4, 15)
+UVANY(RGB24ToUVRow_Any_NEON, RGB24ToUVRow_NEON, RGB24ToUVRow_C, 3, 15)
+UVANY(RAWToUVRow_Any_NEON, RAWToUVRow_NEON, RAWToUVRow_C, 3, 15)
+UVANY(RGB565ToUVRow_Any_NEON, RGB565ToUVRow_NEON, RGB565ToUVRow_C, 2, 15)
+UVANY(ARGB1555ToUVRow_Any_NEON, ARGB1555ToUVRow_NEON, ARGB1555ToUVRow_C, 2, 15)
+UVANY(ARGB4444ToUVRow_Any_NEON, ARGB4444ToUVRow_NEON, ARGB4444ToUVRow_C, 2, 15)
+UVANY(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, YUY2ToUVRow_C, 2, 15)
+UVANY(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, UYVYToUVRow_C, 2, 15)
 #endif
 #undef UVANY
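Note: adding the `MASK` parameter lets one `UVANY` macro wrap both 16-pixel kernels (`MASK` = 15) and the new 32-pixel AVX2 kernel (`MASK` = 31). For example, `UVANY(ARGBToUVRow_Any_AVX2, ARGBToUVRow_Unaligned_AVX2, ARGBToUVRow_C, 4, 31)` expands to roughly the following (whitespace adjusted):

// Declarations as in row.h (uint8 is libyuv's typedef for unsigned char).
typedef unsigned char uint8;
void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb, int src_stride_argb,
                                uint8* dst_u, uint8* dst_v, int width);
void ARGBToUVRow_C(const uint8* src_argb, int src_stride_argb,
                   uint8* dst_u, uint8* dst_v, int width);

void ARGBToUVRow_Any_AVX2(const uint8* src_argb, int src_stride_argb,
                          uint8* dst_u, uint8* dst_v, int width) {
  int n = width & ~31;  // largest multiple of 32 pixels
  // Bulk of the row with the AVX2 kernel...
  ARGBToUVRow_Unaligned_AVX2(src_argb, src_stride_argb, dst_u, dst_v, n);
  // ...and the 0..31 leftover pixels in C. BPP is 4 bytes per ARGB pixel,
  // and U/V advance at half rate due to 2x horizontal subsampling.
  ARGBToUVRow_C(src_argb + n * 4, src_stride_argb,
                dst_u + (n >> 1), dst_v + (n >> 1), width & 31);
}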
source/row_win.cc
@@ -25,14 +25,35 @@ static const vec8 kARGBToY = {
   13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
 };
 
+static const lvec8 kARGBToY_AVX = {
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0,
+  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
+};
+
 static const vec8 kARGBToU = {
   112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
 };
 
+// TODO(fbarchard): Rename kARGBToU_AVX to kARGBToU and use for SSSE3 version.
+static const lvec8 kARGBToU_AVX = {
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0,
+  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
+};
+
 static const vec8 kARGBToV = {
   -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
 };
 
+static const lvec8 kARGBToV_AVX = {
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
+  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
+};
+
+// Unshuffle for vphaddw + vpackuswb vpermd.
+static const lvec32 kShufARGBToY_AVX = {
+  0, 4, 1, 5, 2, 6, 3, 7
+};
+
 // Constants for BGRA.
 static const vec8 kBGRAToY = {
   0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
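Note: the Y coefficients are laid out in ARGB memory order B, G, R, A, i.e. (13, 65, 33, 0) per pixel, roughly half the usual BT.601 (25, 129, 66) weights to pair with the kernel's `>> 7` instead of `>> 8`. A scalar sketch of what the `vpmaddubsw`/`vphaddw`/`vpsrlw`/`vpaddb` chain computes per pixel (illustrative; libyuv's own C path rounds slightly differently):

#include <cstdint>

// One luma sample from one ARGB pixel, as computed by the SIMD path:
// Y = ((13*B + 65*G + 33*R) >> 7) + 16.  Since 13+65+33 = 111, the 16-bit
// intermediate maxes out at 111*255 = 28305 and cannot overflow.
uint8_t ARGBPixelToY(const uint8_t* argb /* bytes B,G,R,A in memory */) {
  int y = (13 * argb[0] + 65 * argb[1] + 33 * argb[2]) >> 7;
  return static_cast<uint8_t>(y + 16);  // kAddY16 bias
}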
@@ -76,11 +97,25 @@ static const uvec8 kAddY16 = {
   16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
 };
 
+static const ulvec8 kAddY16_AVX = {
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
+  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
+};
+
 static const uvec8 kAddUV128 = {
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
   128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
 };
 
+static const ulvec8 kAddUV128_AVX = {
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
+  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
+};
+
 // Shuffle table for converting RGB24 to ARGB.
 static const uvec8 kShuffleMaskRGB24ToARGB = {
   0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
@@ -727,6 +762,49 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
+#ifdef HAS_ARGBTOYROW_AVX2
+// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vmovdqa    ymm6, kShufARGBToY_AVX
+    vmovdqa    ymm5, kAddY16_AVX
+    vmovdqa    ymm4, kARGBToY_AVX
+
+    align      16
+  convertloop:
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    vmovdqa    ymm2, [eax + 64]
+    vmovdqa    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqa    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+    vphaddw    ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8
+    vpackuswb  ymm0, ymm0, ymm2
+    vpermq     ymm0, ymm0, 0xd8
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
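Note: `vphaddw` and `vpackuswb` operate within each 128-bit lane, so their results come out interleaved across lanes; the `vpermd` with `kShufARGBToY_AVX` = {0, 4, 1, 5, 2, 6, 3, 7} undoes that mutation. (The four instructions after `ret` are unreachable and look like leftovers of an alternative `vpermq`-based fixup.) The same 32-pixel kernel re-expressed with AVX2 intrinsics, as a hedged sketch rather than the committed code:

#include <immintrin.h>
#include <stdint.h>

// Process one group of 32 ARGB pixels (128 bytes) into 32 Y bytes.
// Constants mirror kARGBToY_AVX, kAddY16_AVX and kShufARGBToY_AVX.
static void ARGBToY32_Sketch(const uint8_t* src_argb, uint8_t* dst_y) {
  // Bytes (high..low) 0x00,0x21,0x41,0x0D = A:0, R:33, G:65, B:13 weights.
  const __m256i kCoeff = _mm256_set1_epi32(0x0021410D);
  const __m256i kAdd16 = _mm256_set1_epi8(16);
  const __m256i kShuf  = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);

  __m256i p0 = _mm256_loadu_si256((const __m256i*)(src_argb + 0));
  __m256i p1 = _mm256_loadu_si256((const __m256i*)(src_argb + 32));
  __m256i p2 = _mm256_loadu_si256((const __m256i*)(src_argb + 64));
  __m256i p3 = _mm256_loadu_si256((const __m256i*)(src_argb + 96));
  // u8 x s8 multiply-add: word pairs (B*13 + G*65) and (R*33 + A*0).
  p0 = _mm256_maddubs_epi16(p0, kCoeff);
  p1 = _mm256_maddubs_epi16(p1, kCoeff);
  p2 = _mm256_maddubs_epi16(p2, kCoeff);
  p3 = _mm256_maddubs_epi16(p3, kCoeff);
  // Horizontal add finishes 13B+65G+33R per pixel, but per 128-bit lane.
  __m256i y01 = _mm256_hadd_epi16(p0, p1);
  __m256i y23 = _mm256_hadd_epi16(p2, p3);
  y01 = _mm256_srli_epi16(y01, 7);
  y23 = _mm256_srli_epi16(y23, 7);
  __m256i y = _mm256_packus_epi16(y01, y23);
  // hadd+packus mutate dword order across lanes; vpermd restores it.
  y = _mm256_permutevar8x32_epi32(y, kShuf);
  y = _mm256_add_epi8(y, kAdd16);
  _mm256_storeu_si256((__m256i*)dst_y, y);
}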
@@ -761,6 +839,44 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   }
 }
 
+#ifdef HAS_ARGBTOYROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToYRow_Unaligned_AVX2(const uint8* src_argb, uint8* dst_y, int pix) {
+  __asm {
+    mov        eax, [esp + 4]   /* src_argb */
+    mov        edx, [esp + 8]   /* dst_y */
+    mov        ecx, [esp + 12]  /* pix */
+    vmovdqa    ymm6, kShufARGBToY_AVX
+    vmovdqa    ymm5, kAddY16_AVX
+    vmovdqa    ymm4, kARGBToY_AVX
+
+    align      16
+  convertloop:
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpmaddubsw ymm0, ymm0, ymm4
+    vpmaddubsw ymm1, ymm1, ymm4
+    vpmaddubsw ymm2, ymm2, ymm4
+    vpmaddubsw ymm3, ymm3, ymm4
+    lea        eax, [eax + 128]
+    vphaddw    ymm0, ymm0, ymm1
+    vphaddw    ymm2, ymm2, ymm3
+    vpsrlw     ymm0, ymm0, 7
+    vpsrlw     ymm2, ymm2, 7
+    vpackuswb  ymm0, ymm0, ymm2
+    vpermd     ymm0, ymm6, ymm0  // For vphaddw + vpackuswb mutation.
+    vpaddb     ymm0, ymm0, ymm5
+    sub        ecx, 32
+    vmovdqu    [edx], ymm0
+    lea        edx, [edx + 32]
+    jg         convertloop
+    ret
+  }
+}
+#endif  // HAS_ARGBTOYROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
@@ -1031,6 +1147,80 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_AVX2(const uint8* src_argb0, int src_stride_argb,
+                      uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vmovdqa    ymm7, kARGBToU_AVX
+    vmovdqa    ymm6, kARGBToV_AVX
+    vmovdqa    ymm5, kAddUV128_AVX
+    sub        edi, edx             // stride from u to v
+
+    align      16
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    vmovdqa    ymm0, [eax]
+    vmovdqa    ymm1, [eax + 32]
+    vmovdqa    ymm2, [eax + 64]
+    vmovdqa    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax, [eax + 128]
+    vmovdqa    ymm4, ymm0  // TODO(fbarchard): Remove.
+    vshufps    ymm0, ymm0, ymm1, 0x88
+    vshufps    ymm4, ymm4, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4
+    vpermq     ymm0, ymm0, 0xd8  // TODO(fbarchard): Remove.
+    vmovdqa    ymm4, ymm2  // TODO(fbarchard): Remove.
+    vshufps    ymm2, ymm2, ymm3, 0x88
+    vshufps    ymm4, ymm4, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4
+    vpermq     ymm2, ymm2, 0xd8  // TODO(fbarchard): Remove.
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vmovdqa    ymm1, ymm0  // TODO(fbarchard): Remove.
+    vmovdqa    ymm3, ymm2  // TODO(fbarchard): Remove.
+    vpmaddubsw ymm0, ymm0, ymm7  // U
+    vpmaddubsw ymm2, ymm2, ymm7
+    vpmaddubsw ymm1, ymm1, ymm6  // V
+    vpmaddubsw ymm3, ymm3, ymm6
+    vphaddw    ymm0, ymm0, ymm2
+    vpermq     ymm0, ymm0, 0xd8  // TODO(fbarchard): Remove.
+    vphaddw    ymm1, ymm1, ymm3
+    vpermq     ymm1, ymm1, 0xd8  // TODO(fbarchard): Remove.
+    vpsraw     ymm0, ymm0, 8
+    vpsraw     ymm1, ymm1, 8
+    vpacksswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub        ecx, 32
+    vmovdqa    ymm1, ymm0
+    vextractf128 qword ptr [edx], ymm0, 0        // U
+    vextractf128 qword ptr [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
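Note: chroma uses the same multiply-add pipeline with signed coefficients and an arithmetic shift: U = ((112*B - 74*G - 38*R) >> 8) + 128 and V = ((-18*B - 94*G + 112*R) >> 8) + 128, computed on pixels first averaged 2x2 by the `vpavgb`/`vshufps` steps. A scalar sketch (illustrative; `vpavgb` rounds up where this plain mean truncates):

#include <stdint.h>

// One U,V pair from a 2x2 block of ARGB pixels (B,G,R byte order),
// mirroring kARGBToU_AVX / kARGBToV_AVX and the kAddUV128 bias.
void ARGBBlockToUV(const uint8_t* row0, const uint8_t* row1,
                   uint8_t* u, uint8_t* v) {
  // Average the 2x2 block channel-wise.
  int b = (row0[0] + row0[4] + row1[0] + row1[4]) / 4;
  int g = (row0[1] + row0[5] + row1[1] + row1[5]) / 4;
  int r = (row0[2] + row0[6] + row1[2] + row1[6]) / 4;
  // Signed dot products, arithmetic shift, then +128 to re-center.
  *u = (uint8_t)(((112 * b - 74 * g - 38 * r) >> 8) + 128);
  *v = (uint8_t)(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
}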
@@ -1101,6 +1291,80 @@ void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
   }
 }
 
+#ifdef HAS_ARGBTOUVROW_AVX2
+__declspec(naked) __declspec(align(32))
+void ARGBToUVRow_Unaligned_AVX2(const uint8* src_argb0, int src_stride_argb,
+                                uint8* dst_u, uint8* dst_v, int width) {
+  __asm {
+    push       esi
+    push       edi
+    mov        eax, [esp + 8 + 4]   // src_argb
+    mov        esi, [esp + 8 + 8]   // src_stride_argb
+    mov        edx, [esp + 8 + 12]  // dst_u
+    mov        edi, [esp + 8 + 16]  // dst_v
+    mov        ecx, [esp + 8 + 20]  // pix
+    vmovdqa    ymm7, kARGBToU_AVX
+    vmovdqa    ymm6, kARGBToV_AVX
+    vmovdqa    ymm5, kAddUV128_AVX
+    sub        edi, edx             // stride from u to v
+
+    align      16
+  convertloop:
+    /* step 1 - subsample 16x2 argb pixels to 8x1 */
+    vmovdqu    ymm0, [eax]
+    vmovdqu    ymm1, [eax + 32]
+    vmovdqu    ymm2, [eax + 64]
+    vmovdqu    ymm3, [eax + 96]
+    vpavgb     ymm0, ymm0, [eax + esi]
+    vpavgb     ymm1, ymm1, [eax + esi + 32]
+    vpavgb     ymm2, ymm2, [eax + esi + 64]
+    vpavgb     ymm3, ymm3, [eax + esi + 96]
+    lea        eax, [eax + 128]
+    vmovdqa    ymm4, ymm0
+    vshufps    ymm0, ymm0, ymm1, 0x88
+    vshufps    ymm4, ymm4, ymm1, 0xdd
+    vpavgb     ymm0, ymm0, ymm4
+    vpermq     ymm0, ymm0, 0xd8
+    vmovdqa    ymm4, ymm2
+    vshufps    ymm2, ymm2, ymm3, 0x88
+    vshufps    ymm4, ymm4, ymm3, 0xdd
+    vpavgb     ymm2, ymm2, ymm4
+    vpermq     ymm2, ymm2, 0xd8
+
+    // step 2 - convert to U and V
+    // from here down is very similar to Y code except
+    // instead of 32 different pixels, its 16 pixels of U and 16 of V
+    vmovdqa    ymm1, ymm0
+    vmovdqa    ymm3, ymm2
+    vpmaddubsw ymm0, ymm0, ymm7  // U
+    vpmaddubsw ymm2, ymm2, ymm7
+    vpmaddubsw ymm1, ymm1, ymm6  // V
+    vpmaddubsw ymm3, ymm3, ymm6
+    vphaddw    ymm0, ymm0, ymm2
+    vpermq     ymm0, ymm0, 0xd8
+    vphaddw    ymm1, ymm1, ymm3
+    vpermq     ymm1, ymm1, 0xd8
+    vpsraw     ymm0, ymm0, 8
+    vpsraw     ymm1, ymm1, 8
+    vpacksswb  ymm0, ymm0, ymm1
+    vpermq     ymm0, ymm0, 0xd8
+    vpaddb     ymm0, ymm0, ymm5  // -> unsigned
+
+    // step 3 - store 16 U and 16 V values
+    sub        ecx, 32
+    vmovdqa    ymm1, ymm0
+    vextractf128 qword ptr [edx], ymm0, 0        // U
+    vextractf128 qword ptr [edx + edi], ymm0, 1  // V
+    lea        edx, [edx + 16]
+    jg         convertloop
+
+    pop        edi
+    pop        esi
+    ret
+  }
+}
+#endif  // HAS_ARGBTOUVROW_AVX2
+
 __declspec(naked) __declspec(align(16))
 void ARGBToUV444Row_SSSE3(const uint8* src_argb0,
                           uint8* dst_u, uint8* dst_v, int width) {