mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
ARGBToYUV with SSSE3 on any size/alignment
BUG=none TEST=none Review URL: https://webrtc-codereview.appspot.com/366011 git-svn-id: http://libyuv.googlecode.com/svn/trunk@161 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
caf3952548
commit
b5b27d131a
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 160
|
||||
Version: 161
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -16,7 +16,7 @@ namespace libyuv {
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define LIBYUV_VERSION 160
|
||||
#define LIBYUV_VERSION 161
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
|
||||
@ -365,6 +365,11 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
|
||||
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
|
||||
ARGBToYRow = ARGBToYAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -375,6 +380,12 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(width, 16) &&
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) &&
|
||||
IS_ALIGNED(width, 2) && width <= kMaxStride) {
|
||||
ARGBToUVRow = ARGBToUVAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -416,6 +427,11 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
|
||||
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
|
||||
ARGBToYRow = BGRAToYRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
|
||||
ARGBToYRow = BGRAToYAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYRow = BGRAToYRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -426,6 +442,12 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(width, 16) &&
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
|
||||
ARGBToUVRow = BGRAToUVRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) &&
|
||||
IS_ALIGNED(width, 2) && width <= kMaxStride) {
|
||||
ARGBToUVRow = BGRAToUVAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = BGRAToUVRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -467,6 +489,11 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16) &&
|
||||
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
|
||||
ARGBToYRow = ABGRToYRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) && width <= kMaxStride) {
|
||||
ARGBToYRow = ABGRToYAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToYRow = ABGRToYRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
@ -477,6 +504,12 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
|
||||
IS_ALIGNED(width, 16) &&
|
||||
IS_ALIGNED(src_frame, 16) && IS_ALIGNED(src_stride_frame, 16)) {
|
||||
ARGBToUVRow = ABGRToUVRow_SSSE3;
|
||||
} else if (TestCpuFlag(kCpuHasSSSE3) &&
|
||||
IS_ALIGNED(width, 2) && width <= kMaxStride) {
|
||||
ARGBToUVRow = ABGRToUVAnyRow_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToUVRow = ABGRToUVRow_Unaligned_SSSE3;
|
||||
}
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
|
||||
20
source/row.h
20
source/row.h
@ -100,12 +100,22 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
|
||||
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
|
||||
@ -235,6 +245,16 @@ void ARGBToRGB565AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB1555AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
void ARGBToARGB4444AnyRow_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix);
|
||||
|
||||
void ARGBToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYAnyRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void BGRAToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVAnyRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
void FastConvertYUVToARGBAnyRow_NEON(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
|
||||
@ -380,8 +380,17 @@ void NAMEANY(const uint8* y_buf, \
|
||||
memcpy(rgb_buf, row, width << 2); \
|
||||
}
|
||||
|
||||
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
|
||||
#endif
|
||||
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
|
||||
#endif
|
||||
|
||||
// Wrappers to handle odd sizes/alignments
|
||||
#define MAKEYUVANYRGB(NAMEANY, ARGBTORGB, BPP) \
|
||||
void NAMEANY(const uint8* argb_buf, \
|
||||
uint8* rgb_buf, \
|
||||
@ -391,20 +400,40 @@ void NAMEANY(const uint8* argb_buf, \
|
||||
memcpy(rgb_buf, row, width * BPP); \
|
||||
}
|
||||
|
||||
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToARGBAnyRow_SSSE3, FastConvertYUVToARGBRow_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_SSSE3, FastConvertYUVToBGRARow_SSSE3)
|
||||
MAKEYUVANY(FastConvertYUVToABGRAnyRow_SSSE3, FastConvertYUVToABGRRow_SSSE3)
|
||||
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
|
||||
MAKEYUVANYRGB(ARGBToRGB24AnyRow_SSSE3, ARGBToRGB24Row_SSSE3, 3)
|
||||
MAKEYUVANYRGB(ARGBToRAWAnyRow_SSSE3, ARGBToRAWRow_SSSE3, 3)
|
||||
MAKEYUVANYRGB(ARGBToRGB565AnyRow_SSE2, ARGBToRGB565Row_SSE2, 2)
|
||||
MAKEYUVANYRGB(ARGBToARGB1555AnyRow_SSE2, ARGBToARGB1555Row_SSE2, 2)
|
||||
MAKEYUVANYRGB(ARGBToARGB4444AnyRow_SSE2, ARGBToARGB4444Row_SSE2, 2)
|
||||
#endif
|
||||
#if defined(HAS_FASTCONVERTYUVTOARGBROW_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToARGBAnyRow_NEON, FastConvertYUVToARGBRow_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToBGRAAnyRow_NEON, FastConvertYUVToBGRARow_NEON)
|
||||
MAKEYUVANY(FastConvertYUVToABGRAnyRow_NEON, FastConvertYUVToABGRRow_NEON)
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
|
||||
#define MAKEARGBTOYANY(NAMEANY, ARGBTOY) \
|
||||
void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) { \
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]); \
|
||||
ARGBTOY(src_argb, row, width); \
|
||||
memcpy(dst_y, row, width); \
|
||||
}
|
||||
|
||||
MAKEARGBTOYANY(ARGBToYAnyRow_SSSE3, ARGBToYRow_Unaligned_SSSE3)
|
||||
MAKEARGBTOYANY(BGRAToYAnyRow_SSSE3, BGRAToYRow_Unaligned_SSSE3)
|
||||
MAKEARGBTOYANY(ABGRToYAnyRow_SSSE3, ABGRToYRow_Unaligned_SSSE3)
|
||||
|
||||
#define MAKEARGBTOUVANY(NAMEANY, ARGBTOUV) \
|
||||
void NAMEANY(const uint8* src_argb0, int src_stride_argb, \
|
||||
uint8* dst_u, uint8* dst_v, int width) { \
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]); \
|
||||
ARGBTOUV(src_argb0, src_stride_argb, row, row + kMaxStride, width); \
|
||||
int halfwidth = (width + 1) >> 1; \
|
||||
memcpy(dst_u, row, halfwidth); \
|
||||
memcpy(dst_v, row + kMaxStride, halfwidth); \
|
||||
}
|
||||
|
||||
MAKEARGBTOUVANY(ARGBToUVAnyRow_SSSE3, ARGBToUVRow_Unaligned_SSSE3)
|
||||
MAKEARGBTOUVANY(BGRAToUVAnyRow_SSSE3, BGRAToUVRow_Unaligned_SSSE3)
|
||||
MAKEARGBTOUVANY(ABGRToUVAnyRow_SSSE3, ABGRToUVRow_Unaligned_SSSE3)
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -257,6 +257,43 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
asm volatile (
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x20(%0),%%xmm2 \n"
|
||||
"movdqu 0x30(%0),%%xmm3 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm3 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"phaddw %%xmm1,%%xmm0 \n"
|
||||
"phaddw %%xmm3,%%xmm2 \n"
|
||||
"psrlw $0x7,%%xmm0 \n"
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"movdqu %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
: "m"(kARGBToY), // %3
|
||||
"m"(kAddY16) // %4
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
|
||||
);
|
||||
}
|
||||
#endif
|
||||
@ -325,6 +362,74 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
#endif
|
||||
);
|
||||
}
|
||||
|
||||
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
asm volatile (
|
||||
"movdqa %0,%%xmm4 \n"
|
||||
"movdqa %1,%%xmm3 \n"
|
||||
"movdqa %2,%%xmm5 \n"
|
||||
:
|
||||
: "m"(kARGBToU), // %0
|
||||
"m"(kARGBToV), // %1
|
||||
"m"(kAddUV128) // %2
|
||||
:
|
||||
#if defined(__SSE2__)
|
||||
"xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
);
|
||||
asm volatile (
|
||||
"sub %1,%2 \n"
|
||||
"1: \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x20(%0),%%xmm2 \n"
|
||||
"movdqu 0x30(%0),%%xmm6 \n"
|
||||
"movdqu (%0,%4,1),%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm0 \n"
|
||||
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm1 \n"
|
||||
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm2 \n"
|
||||
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm6 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm7 \n"
|
||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm7 \n"
|
||||
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
||||
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm3,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm3,%%xmm6 \n"
|
||||
"phaddw %%xmm2,%%xmm0 \n"
|
||||
"phaddw %%xmm6,%%xmm1 \n"
|
||||
"psraw $0x8,%%xmm0 \n"
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"movlps %%xmm0,(%1) \n"
|
||||
"movhps %%xmm0,(%1,%2,1) \n"
|
||||
"lea 0x8(%1),%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
"+rm"(width) // %3
|
||||
: "r"(static_cast<intptr_t>(src_stride_argb))
|
||||
: "memory", "cc"
|
||||
#if defined(__SSE2__)
|
||||
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
||||
#endif
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
|
||||
@ -624,6 +729,18 @@ void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
BGRAToARGBRow_SSSE3(src_argb, row, pix);
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
|
||||
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
ABGRToARGBRow_C(src_argb, row, pix);
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
|
||||
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
BGRAToARGBRow_C(src_argb, row, pix);
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||
@ -642,6 +759,22 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
ABGRToARGBRow_C(src_argb, row, pix);
|
||||
ABGRToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
BGRAToARGBRow_C(src_argb, row, pix);
|
||||
BGRAToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAS_MIRRORROW_SSSE3
|
||||
|
||||
@ -611,6 +611,39 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit MSVC version: converts 4-byte pixels at src_argb to one byte per
// pixel at dst_y, 16 pixels per iteration.  Loads and stores on the caller's
// buffers use movdqu, so neither pointer needs 16-byte alignment.
// Naked function: arguments are read straight from the stack.
// Register roles: eax = source cursor, edx = destination cursor,
// ecx = pixels remaining, xmm4 = weights (kARGBToY), xmm5 = kAddY16.
__declspec(naked)
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        ecx, [esp + 12]        // pix
    mov        edx, [esp + 8]         // dst_y
    mov        eax, [esp + 4]         // src_argb
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kARGBToY

 yloop16:
    // 16 pixels in: 4 x 16 bytes.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    // Weight channel bytes, pair-sum into words.
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 0x40]
    // One word per pixel.
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    // Scale, narrow to unsigned bytes, add offset.
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 0x10]
    sub        ecx, 16
    ja         yloop16                // loop while remainder > 0, no borrow
    ret
  }
}
|
||||
|
||||
__declspec(naked)
|
||||
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
@ -644,6 +677,39 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit MSVC version: converts 4-byte BGRA pixels at src_argb to one byte
// per pixel at dst_y, 16 pixels per iteration.  Loads and stores on the
// caller's buffers use movdqu, so neither pointer needs 16-byte alignment.
// Naked function: arguments are read straight from the stack.
// Register roles: eax = source cursor, edx = destination cursor,
// ecx = pixels remaining, xmm4 = weights (kBGRAToY), xmm5 = kAddY16.
__declspec(naked)
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        ecx, [esp + 12]        // pix
    mov        edx, [esp + 8]         // dst_y
    mov        eax, [esp + 4]         // src_argb
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kBGRAToY

 yloop16:
    // 16 pixels in: 4 x 16 bytes.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    // Weight channel bytes, pair-sum into words.
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 0x40]
    // One word per pixel.
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    // Scale, narrow to unsigned bytes, add offset.
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 0x10]
    sub        ecx, 16
    ja         yloop16                // loop while remainder > 0, no borrow
    ret
  }
}
|
||||
|
||||
__declspec(naked)
|
||||
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
@ -677,6 +743,39 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit MSVC version: converts 4-byte ABGR pixels at src_argb to one byte
// per pixel at dst_y, 16 pixels per iteration.  Loads and stores on the
// caller's buffers use movdqu, so neither pointer needs 16-byte alignment.
// Naked function: arguments are read straight from the stack.
// Register roles: eax = source cursor, edx = destination cursor,
// ecx = pixels remaining, xmm4 = weights (kABGRToY), xmm5 = kAddY16.
__declspec(naked)
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
__asm {
    mov        ecx, [esp + 12]        // pix
    mov        edx, [esp + 8]         // dst_y
    mov        eax, [esp + 4]         // src_argb
    movdqa     xmm5, kAddY16
    movdqa     xmm4, kABGRToY

 yloop16:
    // 16 pixels in: 4 x 16 bytes.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    // Weight channel bytes, pair-sum into words.
    pmaddubsw  xmm0, xmm4
    pmaddubsw  xmm1, xmm4
    pmaddubsw  xmm2, xmm4
    pmaddubsw  xmm3, xmm4
    lea        eax, [eax + 0x40]
    // One word per pixel.
    phaddw     xmm0, xmm1
    phaddw     xmm2, xmm3
    // Scale, narrow to unsigned bytes, add offset.
    psrlw      xmm0, 7
    psrlw      xmm2, 7
    packuswb   xmm0, xmm2
    paddb      xmm0, xmm5
    movdqu     [edx], xmm0
    lea        edx, [edx + 0x10]
    sub        ecx, 16
    ja         yloop16                // loop while remainder > 0, no borrow
    ret
  }
}
|
||||
|
||||
__declspec(naked)
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
@ -741,6 +840,75 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 32-bit MSVC version: produces 8 U and 8 V bytes per 16 source pixels from
// two adjacent rows (src_argb0 and src_argb0 + src_stride_argb).  Source
// loads use movdqu; outputs are written as 8-byte halves.
// Naked function: esi / edi are saved and restored by hand, and arguments are
// read from the stack past those two pushes.
// Register roles: eax = source cursor, esi = row stride, edx = U cursor,
// edi = (dst_v - dst_u), ecx = pixels remaining,
// xmm7 = kARGBToU, xmm6 = kARGBToV, xmm5 = kAddUV128.
__declspec(naked)
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_argb
    mov        esi, [esp + 8 + 8]     // src_stride_argb
    mov        edx, [esp + 8 + 12]    // dst_u
    mov        edi, [esp + 8 + 16]    // dst_v
    mov        ecx, [esp + 8 + 20]    // pix
    movdqa     xmm7, kARGBToU
    movdqa     xmm6, kARGBToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx               // edi = offset from U plane to V plane

 uvloop16:
    // Stage 1: reduce a 16x2 block of pixels to 8x1.
    // Vertical average of the two rows.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 0x10]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 0x20]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 0x30]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 0x40]
    // Horizontal average of even / odd pixels.
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // Stage 2: weight the 8 averaged pixels into 8 U and 8 V values.
    // Same shape as the Y kernel, but with two weight sets.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7             // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6             // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5             // signed -> unsigned

    // Stage 3: write 8 U bytes and 8 V bytes.
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         uvloop16               // loop while remainder > 0, no borrow
    pop        edi
    pop        esi
    ret
  }
}
|
||||
|
||||
__declspec(naked)
|
||||
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
@ -805,6 +973,74 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
// 32-bit MSVC version: produces 8 U and 8 V bytes per 16 BGRA source pixels
// from two adjacent rows (src_argb0 and src_argb0 + src_stride_argb).  Source
// loads use movdqu; outputs are written as 8-byte halves.
// Naked function: esi / edi are saved and restored by hand, and arguments are
// read from the stack past those two pushes.
// Register roles: eax = source cursor, esi = row stride, edx = U cursor,
// edi = (dst_v - dst_u), ecx = pixels remaining,
// xmm7 = kBGRAToU, xmm6 = kBGRAToV, xmm5 = kAddUV128.
__declspec(naked)
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_argb
    mov        esi, [esp + 8 + 8]     // src_stride_argb
    mov        edx, [esp + 8 + 12]    // dst_u
    mov        edi, [esp + 8 + 16]    // dst_v
    mov        ecx, [esp + 8 + 20]    // pix
    movdqa     xmm7, kBGRAToU
    movdqa     xmm6, kBGRAToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx               // edi = offset from U plane to V plane

 uvloop16:
    // Stage 1: reduce a 16x2 block of pixels to 8x1.
    // Vertical average of the two rows.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 0x10]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 0x20]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 0x30]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 0x40]
    // Horizontal average of even / odd pixels.
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // Stage 2: weight the 8 averaged pixels into 8 U and 8 V values.
    // Same shape as the Y kernel, but with two weight sets.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7             // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6             // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5             // signed -> unsigned

    // Stage 3: write 8 U bytes and 8 V bytes.
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         uvloop16               // loop while remainder > 0, no borrow
    pop        edi
    pop        esi
    ret
  }
}
|
||||
|
||||
__declspec(naked)
|
||||
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
@ -869,6 +1105,75 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// 32-bit MSVC version: produces 8 U and 8 V bytes per 16 ABGR source pixels
// from two adjacent rows (src_argb0 and src_argb0 + src_stride_argb).  Source
// loads use movdqu; outputs are written as 8-byte halves.
// Naked function: esi / edi are saved and restored by hand, and arguments are
// read from the stack past those two pushes.
// Register roles: eax = source cursor, esi = row stride, edx = U cursor,
// edi = (dst_v - dst_u), ecx = pixels remaining,
// xmm7 = kABGRToU, xmm6 = kABGRToV, xmm5 = kAddUV128.
__declspec(naked)
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
__asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]     // src_argb
    mov        esi, [esp + 8 + 8]     // src_stride_argb
    mov        edx, [esp + 8 + 12]    // dst_u
    mov        edi, [esp + 8 + 16]    // dst_v
    mov        ecx, [esp + 8 + 20]    // pix
    movdqa     xmm7, kABGRToU
    movdqa     xmm6, kABGRToV
    movdqa     xmm5, kAddUV128
    sub        edi, edx               // edi = offset from U plane to V plane

 uvloop16:
    // Stage 1: reduce a 16x2 block of pixels to 8x1.
    // Vertical average of the two rows.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 0x10]
    movdqu     xmm2, [eax + 0x20]
    movdqu     xmm3, [eax + 0x30]
    movdqu     xmm4, [eax + esi]
    pavgb      xmm0, xmm4
    movdqu     xmm4, [eax + esi + 0x10]
    pavgb      xmm1, xmm4
    movdqu     xmm4, [eax + esi + 0x20]
    pavgb      xmm2, xmm4
    movdqu     xmm4, [eax + esi + 0x30]
    pavgb      xmm3, xmm4
    lea        eax, [eax + 0x40]
    // Horizontal average of even / odd pixels.
    movdqa     xmm4, xmm0
    shufps     xmm0, xmm1, 0x88
    shufps     xmm4, xmm1, 0xdd
    pavgb      xmm0, xmm4
    movdqa     xmm4, xmm2
    shufps     xmm2, xmm3, 0x88
    shufps     xmm4, xmm3, 0xdd
    pavgb      xmm2, xmm4

    // Stage 2: weight the 8 averaged pixels into 8 U and 8 V values.
    // Same shape as the Y kernel, but with two weight sets.
    movdqa     xmm1, xmm0
    movdqa     xmm3, xmm2
    pmaddubsw  xmm0, xmm7             // U
    pmaddubsw  xmm2, xmm7
    pmaddubsw  xmm1, xmm6             // V
    pmaddubsw  xmm3, xmm6
    phaddw     xmm0, xmm2
    phaddw     xmm1, xmm3
    psraw      xmm0, 8
    psraw      xmm1, 8
    packsswb   xmm0, xmm1
    paddb      xmm0, xmm5             // signed -> unsigned

    // Stage 3: write 8 U bytes and 8 V bytes.
    movlps     qword ptr [edx], xmm0        // U
    movhps     qword ptr [edx + edi], xmm0  // V
    lea        edx, [edx + 8]
    sub        ecx, 16
    ja         uvloop16               // loop while remainder > 0, no borrow
    pop        edi
    pop        esi
    ret
  }
}
|
||||
|
||||
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSSE3
|
||||
|
||||
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user