/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

#include "libyuv/basic_types.h"

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3

// Constant multiplication table for converting ARGB to I400.
static const vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};
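// The weights above are the BT.601 luma coefficients (R 0.257, G 0.504,
// B 0.098) scaled to 7 bits and laid out in B,G,R,A memory order;
// ARGBToYRow_SSSE3 applies them with pmaddubsw, shifts the sums right by 7
// and adds 16 (kAddY16).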

static const uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};

#ifdef HAS_ARGBTOUVROW_SSSE3
static const vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Note: declared as signed vec8 (the values are negative); the original
// uvec8 declaration would narrow the negative constants.
static const vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};
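// The U and V weights are the BT.601 chroma coefficients scaled to 8 bits
// (for example 0.439 * 256 = 112), again in B,G,R,A memory order;
// ARGBToUVRow_SSSE3 shifts the products right by 8 and biases the result
// by 128 (kAddUV128).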
static const uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};
#endif

// Shuffle table for converting BG24 to ARGB.
static const uvec8 kShuffleMaskBG24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};
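// In each group of four mask bytes, three indices select one packed 24 bit
// pixel and the fourth (12-15) is a placeholder; the row functions
// overwrite that byte with 0xff alpha via por.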

// Shuffle table for converting RAW to ARGB.
static const uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
static const uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
static const uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "1: \n"
  "movq (%0),%%xmm0 \n"  // read 8 Y bytes
  "lea 0x8(%0),%0 \n"
  "punpcklbw %%xmm0,%%xmm0 \n"  // duplicate each Y: Y -> YY
  "movdqa %%xmm0,%%xmm1 \n"
  "punpcklwd %%xmm0,%%xmm0 \n"  // YY -> YYYY, low 4 pixels
  "punpckhwd %%xmm1,%%xmm1 \n"  // YY -> YYYY, high 4 pixels
  "por %%xmm5,%%xmm0 \n"  // set alpha bytes to 0xff
  "por %%xmm5,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "lea 0x20(%1),%1 \n"
  "sub $0x8,%2 \n"
  "ja 1b \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
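
// Scalar equivalent of I400ToARGBRow_SSE2, for reference only (a sketch;
// I400ToARGBRow_C is a hypothetical name, not a function defined here).
// Each Y byte becomes one gray ARGB pixel with 0xff alpha:
//
// static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
//   for (int x = 0; x < pix; ++x) {
//     uint8 y = src_y[x];
//     dst_argb[0] = y;    // B
//     dst_argb[1] = y;    // G
//     dst_argb[2] = y;    // R
//     dst_argb[3] = 255;  // A
//     dst_argb += 4;
//   }
// }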

void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskABGRToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
  asm volatile (
  "movdqa %3,%%xmm5 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "lea 0x10(%0),%0 \n"
  "pshufb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x4,%2 \n"
  "ja 1b \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBGRAToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}

void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:7] xmm1[8:15] }
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"
  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm1[0:11] xmm0[12:15] }
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_bg24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskBG24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
  "pcmpeqb %%xmm5,%%xmm5 \n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm3 \n"
  "lea 0x30(%0),%0 \n"
  "movdqa %%xmm3,%%xmm2 \n"
  "palignr $0x8,%%xmm1,%%xmm2 \n"  // xmm2 = { xmm3[0:7] xmm1[8:15] }
  "pshufb %%xmm4,%%xmm2 \n"
  "por %%xmm5,%%xmm2 \n"
  "palignr $0xc,%%xmm0,%%xmm1 \n"  // xmm1 = { xmm1[0:11] xmm0[12:15] }
  "pshufb %%xmm4,%%xmm0 \n"
  "movdqa %%xmm2,0x20(%1) \n"
  "por %%xmm5,%%xmm0 \n"
  "pshufb %%xmm4,%%xmm1 \n"
  "movdqa %%xmm0,(%1) \n"
  "por %%xmm5,%%xmm1 \n"
  "palignr $0x4,%%xmm3,%%xmm3 \n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm4,%%xmm3 \n"
  "movdqa %%xmm1,0x10(%1) \n"
  "por %%xmm5,%%xmm3 \n"
  "movdqa %%xmm3,0x30(%1) \n"
  "lea 0x40(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}

void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
  "movdqa %4,%%xmm5 \n"
  "movdqa %3,%%xmm4 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm3 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm4,%%xmm1 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm4,%%xmm3 \n"
  "lea 0x40(%0),%0 \n"
  "phaddw %%xmm1,%%xmm0 \n"
  "phaddw %%xmm3,%%xmm2 \n"
  "psrlw $0x7,%%xmm0 \n"
  "psrlw $0x7,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"
  "movdqa %%xmm0,(%1) \n"
  "lea 0x10(%1),%1 \n"
  "sub $0x10,%2 \n"
  "ja 1b \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
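
// Scalar equivalent of ARGBToYRow_SSSE3, for reference only (a sketch;
// ARGBToYRow_C is a hypothetical name).  It uses the same 7 bit weights:
//
// static void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
//   for (int x = 0; x < pix; ++x) {
//     dst_y[x] = static_cast<uint8>(
//         ((13 * src_argb[0] + 65 * src_argb[1] + 33 * src_argb[2]) >> 7) + 16);
//     src_argb += 4;
//   }
// }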
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Load the U and V coefficients and the 128 bias once; these registers
  // are expected to survive into the second asm block below.
  asm volatile (
  "movdqa %0,%%xmm4 \n"
  "movdqa %1,%%xmm3 \n"
  "movdqa %2,%%xmm5 \n"
  :
  : "m"(kARGBToU),   // %0
    "m"(kARGBToV),   // %1
    "m"(kAddUV128)   // %2
  :
#if defined(__SSE2__)
    "xmm3", "xmm4", "xmm5"
#endif
  );
  asm volatile (
  "sub %1,%2 \n"
  "1: \n"
  "movdqa (%0),%%xmm0 \n"
  "movdqa 0x10(%0),%%xmm1 \n"
  "movdqa 0x20(%0),%%xmm2 \n"
  "movdqa 0x30(%0),%%xmm6 \n"
  "pavgb (%0,%4,1),%%xmm0 \n"  // average rows 0 and 1
  "pavgb 0x10(%0,%4,1),%%xmm1 \n"
  "pavgb 0x20(%0,%4,1),%%xmm2 \n"
  "pavgb 0x30(%0,%4,1),%%xmm6 \n"
  "lea 0x40(%0),%0 \n"
  "movdqa %%xmm0,%%xmm7 \n"
  "shufps $0x88,%%xmm1,%%xmm0 \n"  // even pixels
  "shufps $0xdd,%%xmm1,%%xmm7 \n"  // odd pixels
  "pavgb %%xmm7,%%xmm0 \n"
  "movdqa %%xmm2,%%xmm7 \n"
  "shufps $0x88,%%xmm6,%%xmm2 \n"
  "shufps $0xdd,%%xmm6,%%xmm7 \n"
  "pavgb %%xmm7,%%xmm2 \n"
  "movdqa %%xmm0,%%xmm1 \n"
  "movdqa %%xmm2,%%xmm6 \n"
  "pmaddubsw %%xmm4,%%xmm0 \n"
  "pmaddubsw %%xmm4,%%xmm2 \n"
  "pmaddubsw %%xmm3,%%xmm1 \n"
  "pmaddubsw %%xmm3,%%xmm6 \n"
  "phaddw %%xmm2,%%xmm0 \n"
  "phaddw %%xmm6,%%xmm1 \n"
  "psraw $0x8,%%xmm0 \n"
  "psraw $0x8,%%xmm1 \n"
  "packsswb %%xmm1,%%xmm0 \n"
  "paddb %%xmm5,%%xmm0 \n"
  "movlps %%xmm0,(%1) \n"
  "movhps %%xmm0,(%1,%2,1) \n"
  "lea 0x8(%1),%1 \n"
  "sub $0x10,%3 \n"
  "ja 1b \n"
  : "+r"(src_argb0),  // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+rm"(width)      // %3
  : "r"(static_cast<intptr_t>(src_stride_argb))
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
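
// Scalar sketch of the same computation (ARGBToUVRow_C is a hypothetical
// name; pavgb rounds up, so results may differ by 1 from this version).
// Each U/V pair comes from a 2x2 block of ARGB pixels:
//
// static void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
//                           uint8* dst_u, uint8* dst_v, int width) {
//   const uint8* src_argb1 = src_argb0 + src_stride_argb;
//   for (int x = 0; x < width; x += 2) {
//     int b = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
//     int g = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
//     int r = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
//     *dst_u++ = static_cast<uint8>(((112 * b - 74 * g - 38 * r) >> 8) + 128);
//     *dst_v++ = static_cast<uint8>(((-18 * b - 94 * g + 112 * r) >> 8) + 128);
//     src_argb0 += 8;
//     src_argb1 += 8;
//   }
// }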
#endif

// The following code requires 6 registers and prefers 7 registers.
// Using 7 registers requires -fpic to be off and -fomit-frame-pointer
// to be enabled.
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
#if defined(__x86_64__)
#define REG_a "rax"
#define REG_d "rdx"
#else
#define REG_a "eax"
#define REG_d "edx"
#endif
#if defined(__APPLE__) || defined(__x86_64__)
#define OMITFP
#else
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
#endif

#define CLOBBER "%"REG_a, "%"REG_d
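
// kCoefficientsRgbY (and the Bgra/Abgr variants) is a lookup table of
// 8 byte entries: Y entries at byte offset 0, U at 2048 and V at 4096.
// Each entry packs four signed 16 bit per-channel contributions, so the
// sum of the three entries for (Y,U,V), shifted right by 6 and packed with
// unsigned saturation, yields one ARGB pixel.  A per-pixel sketch (YuvPixel
// is a hypothetical helper, assuming int16 entries as the paddsw/psraw
// usage implies):
//
// static inline void YuvPixel(uint8 y, uint8 u, uint8 v, uint8* rgb_buf) {
//   const int16* t = reinterpret_cast<const int16*>(kCoefficientsRgbY);
//   for (int i = 0; i < 4; ++i) {  // B, G, R, A contributions
//     int value =
//         (t[y * 4 + i] + t[1024 + u * 4 + i] + t[2048 + v * 4 + i]) >> 6;
//     rgb_buf[i] =
//         static_cast<uint8>(value < 0 ? 0 : (value > 255 ? 255 : value));
//   }
// }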
// This version produces 2 pixels per loop iteration.
#define YUVTORGB \
  "1: \n" \
  "movzb (%1),%%"REG_a" \n" \
  "lea 1(%1),%1 \n" \
  "movzb (%2),%%"REG_d" \n" \
  "lea 1(%2),%2 \n" \
  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
  "movzb 0(%0),%%"REG_a" \n" \
  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
  "paddsw %%xmm1,%%xmm0 \n" \
  "movzb 1(%0),%%"REG_d" \n" \
  "punpcklqdq %%xmm0,%%xmm0 \n" \
  "lea 2(%0),%0 \n" \
  "movq 0(%5,%%"REG_a",8),%%xmm1 \n" \
  "movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
  "paddsw %%xmm0,%%xmm1 \n" \
  "psraw $6,%%xmm1 \n" \
  "packuswb %%xmm1,%%xmm1 \n" \
  "movq %%xmm1,0(%3) \n" \
  "lea 8(%3),%3 \n" \
  "sub $0x2,%4 \n" \
  "ja 1b \n"
// This version produces 4 pixels per loop iteration.
#define YUVTORGB4 \
  "1: \n" \
  "movzb 0(%1),%%"REG_a" \n" \
  "movzb 0(%2),%%"REG_d" \n" \
  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
  "movzb 0(%0),%%"REG_a" \n" \
  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
  "paddsw %%xmm1,%%xmm0 \n" \
  "movzb 1(%0),%%"REG_d" \n" \
  "punpcklqdq %%xmm0,%%xmm0 \n" \
  "movq 0(%5,%%"REG_a",8),%%xmm2 \n" \
  "movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
  "paddsw %%xmm0,%%xmm2 \n" \
  "psraw $6,%%xmm2 \n" \
  "movzb 1(%1),%%"REG_a" \n" \
  "movzb 1(%2),%%"REG_d" \n" \
  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
  "movzb 2(%0),%%"REG_a" \n" \
  "movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
  "paddsw %%xmm1,%%xmm0 \n" \
  "movzb 3(%0),%%"REG_d" \n" \
  "punpcklqdq %%xmm0,%%xmm0 \n" \
  "movq 0(%5,%%"REG_a",8),%%xmm3 \n" \
  "movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
  "paddsw %%xmm0,%%xmm3 \n" \
  "psraw $6,%%xmm3 \n" \
  "lea 2(%1),%1 \n" \
  "lea 2(%2),%2 \n" \
  "lea 4(%0),%0 \n" \
  "packuswb %%xmm3,%%xmm2 \n" \
  "movdqa %%xmm2,0(%3) \n" \
  "lea 16(%3),%3 \n" \
  "sub $0x4,%4 \n" \
  "ja 1b \n"

// 6 or 7 registers
void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,   // rdi
                                         const uint8* u_buf,   // rsi
                                         const uint8* v_buf,   // rdx
                                         uint8* rgb_buf,       // rcx
                                         int width) {          // r8
  asm volatile (
  YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(kCoefficientsRgbY)  // %5
  : "memory", "cc", CLOBBER
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// 6 or 7 registers
void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,   // rdi
                                          const uint8* u_buf,   // rsi
                                          const uint8* v_buf,   // rdx
                                          uint8* rgb_buf,       // rcx
                                          int width) {          // r8
  asm volatile (
  YUVTORGB4
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(kCoefficientsRgbY)  // %5
  : "memory", "cc", CLOBBER
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,   // rdi
                                         const uint8* u_buf,   // rsi
                                         const uint8* v_buf,   // rdx
                                         uint8* rgb_buf,       // rcx
                                         int width) {          // r8
  asm volatile (
  YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(kCoefficientsBgraY)  // %5
  : "memory", "cc", CLOBBER
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,   // rdi
                                         const uint8* u_buf,   // rsi
                                         const uint8* v_buf,   // rdx
                                         uint8* rgb_buf,       // rcx
                                         int width) {          // r8
  asm volatile (
  YUVTORGB
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(kCoefficientsAbgrY)  // %5
  : "memory", "cc", CLOBBER
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}

// 6 registers
void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,   // rdi
                                            const uint8* u_buf,   // rsi
                                            const uint8* v_buf,   // rdx
                                            uint8* rgb_buf,       // rcx
                                            int width) {          // r8
  asm volatile (
  "1: \n"
  "movzb (%1),%%"REG_a" \n"
  "lea 1(%1),%1 \n"
  "movq 2048(%5,%%"REG_a",8),%%xmm0 \n"
  "movzb (%2),%%"REG_a" \n"
  "lea 1(%2),%2 \n"
  "movq 4096(%5,%%"REG_a",8),%%xmm1 \n"
  "paddsw %%xmm1,%%xmm0 \n"
  "movzb (%0),%%"REG_a" \n"
  "lea 1(%0),%0 \n"
  "movq 0(%5,%%"REG_a",8),%%xmm2 \n"
  "paddsw %%xmm0,%%xmm2 \n"
  "shufps $0x44,%%xmm2,%%xmm2 \n"
  "psraw $0x6,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm2 \n"
  "movd %%xmm2,0x0(%3) \n"
  "lea 4(%3),%3 \n"
  "sub $0x1,%4 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(u_buf),    // %1
    "+r"(v_buf),    // %2
    "+r"(rgb_buf),  // %3
    "+rm"(width)    // %4
  : "r"(kCoefficientsRgbY)  // %5
  : "memory", "cc", "%"REG_a
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

// 5 registers
void FastConvertYToARGBRow_SSE2(const uint8* y_buf,  // rdi
                                uint8* rgb_buf,      // rcx
                                int width) {         // r8
  asm volatile (
  "1: \n"
  "movzb (%0),%%"REG_a" \n"
  "movzb 0x1(%0),%%"REG_d" \n"
  "movq (%3,%%"REG_a",8),%%xmm2 \n"
  "lea 2(%0),%0 \n"
  "movhps (%3,%%"REG_d",8),%%xmm2 \n"
  "psraw $0x6,%%xmm2 \n"
  "packuswb %%xmm2,%%xmm2 \n"
  "movq %%xmm2,0x0(%1) \n"
  "lea 8(%1),%1 \n"
  "sub $0x2,%2 \n"
  "ja 1b \n"
  : "+r"(y_buf),    // %0
    "+r"(rgb_buf),  // %1
    "+rm"(width)    // %2
  : "r"(kCoefficientsRgbY)  // %3
  : "memory", "cc", "%"REG_a, "%"REG_d
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}

#endif

#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
// 32 bit mmx gcc version

#ifdef OSX
#define UNDERSCORE "_"  // OS X prefixes C symbol names with an underscore.
#else
#define UNDERSCORE ""
#endif

void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
asm(
  ".text \n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToARGBRow_MMX \n"
  "_FastConvertYUVToARGBRow_MMX: \n"
#else
  ".global FastConvertYUVToARGBRow_MMX \n"
  "FastConvertYUVToARGBRow_MMX: \n"
#endif
  "pusha \n"
  // pusha saves the 8 general registers (32 bytes), so the first stack
  // argument lands at 0x24(%esp).
  "mov 0x24(%esp),%edx \n"
  "mov 0x28(%esp),%edi \n"
  "mov 0x2c(%esp),%esi \n"
  "mov 0x30(%esp),%ebp \n"
  "mov 0x34(%esp),%ecx \n"

  "1: \n"
  "movzbl (%edi),%eax \n"
  "lea 1(%edi),%edi \n"
  "movzbl (%esi),%ebx \n"
  "lea 1(%esi),%esi \n"
  "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx \n"
  "movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
  "lea 2(%edx),%edx \n"
  "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1 \n"
  "paddsw %mm0,%mm2 \n"
  "psraw $0x6,%mm1 \n"
  "psraw $0x6,%mm2 \n"
  "packuswb %mm2,%mm1 \n"
  "movq %mm1,0x0(%ebp) \n"
  "lea 8(%ebp),%ebp \n"
  "sub $0x2,%ecx \n"
  "ja 1b \n"
  "popa \n"
  "ret \n"
);
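
// The BGRA and ABGR variants below are identical to the ARGB version above
// except for the coefficient table they index.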

void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
asm(
  ".text \n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToBGRARow_MMX \n"
  "_FastConvertYUVToBGRARow_MMX: \n"
#else
  ".global FastConvertYUVToBGRARow_MMX \n"
  "FastConvertYUVToBGRARow_MMX: \n"
#endif
  "pusha \n"
  "mov 0x24(%esp),%edx \n"
  "mov 0x28(%esp),%edi \n"
  "mov 0x2c(%esp),%esi \n"
  "mov 0x30(%esp),%ebp \n"
  "mov 0x34(%esp),%ecx \n"

  "1: \n"
  "movzbl (%edi),%eax \n"
  "lea 1(%edi),%edi \n"
  "movzbl (%esi),%ebx \n"
  "lea 1(%esi),%esi \n"
  "movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax \n"
  "paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx \n"
  "movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
  "lea 2(%edx),%edx \n"
  "movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1 \n"
  "paddsw %mm0,%mm2 \n"
  "psraw $0x6,%mm1 \n"
  "psraw $0x6,%mm2 \n"
  "packuswb %mm2,%mm1 \n"
  "movq %mm1,0x0(%ebp) \n"
  "lea 8(%ebp),%ebp \n"
  "sub $0x2,%ecx \n"
  "ja 1b \n"
  "popa \n"
  "ret \n"
);

void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width);
asm(
  ".text \n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUVToABGRRow_MMX \n"
  "_FastConvertYUVToABGRRow_MMX: \n"
#else
  ".global FastConvertYUVToABGRRow_MMX \n"
  "FastConvertYUVToABGRRow_MMX: \n"
#endif
  "pusha \n"
  "mov 0x24(%esp),%edx \n"
  "mov 0x28(%esp),%edi \n"
  "mov 0x2c(%esp),%esi \n"
  "mov 0x30(%esp),%ebp \n"
  "mov 0x34(%esp),%ecx \n"

  "1: \n"
  "movzbl (%edi),%eax \n"
  "lea 1(%edi),%edi \n"
  "movzbl (%esi),%ebx \n"
  "lea 1(%esi),%esi \n"
  "movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax \n"
  "paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
  "movzbl 0x1(%edx),%ebx \n"
  "movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
  "lea 2(%edx),%edx \n"
  "movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
  "paddsw %mm0,%mm1 \n"
  "paddsw %mm0,%mm2 \n"
  "psraw $0x6,%mm1 \n"
  "psraw $0x6,%mm2 \n"
  "packuswb %mm2,%mm1 \n"
  "movq %mm1,0x0(%ebp) \n"
  "lea 8(%ebp),%ebp \n"
  "sub $0x2,%ecx \n"
  "ja 1b \n"
  "popa \n"
  "ret \n"
);

void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width);
asm(
  ".text \n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYUV444ToARGBRow_MMX \n"
  "_FastConvertYUV444ToARGBRow_MMX: \n"
#else
  ".global FastConvertYUV444ToARGBRow_MMX \n"
  "FastConvertYUV444ToARGBRow_MMX: \n"
#endif
  "pusha \n"
  "mov 0x24(%esp),%edx \n"
  "mov 0x28(%esp),%edi \n"
  "mov 0x2c(%esp),%esi \n"
  "mov 0x30(%esp),%ebp \n"
  "mov 0x34(%esp),%ecx \n"

  "1: \n"
  "movzbl (%edi),%eax \n"
  "lea 1(%edi),%edi \n"
  "movzbl (%esi),%ebx \n"
  "lea 1(%esi),%esi \n"
  "movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
  "movzbl (%edx),%eax \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
  "lea 1(%edx),%edx \n"
  "paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
  "psraw $0x6,%mm0 \n"
  "packuswb %mm0,%mm0 \n"
  "movd %mm0,0x0(%ebp) \n"
  "lea 4(%ebp),%ebp \n"
  "sub $0x1,%ecx \n"
  "ja 1b \n"
  "popa \n"
  "ret \n"
);

void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width);
asm(
  ".text \n"
#if defined(OSX) || defined(IOS)
  ".globl _FastConvertYToARGBRow_MMX \n"
  "_FastConvertYToARGBRow_MMX: \n"
#else
  ".global FastConvertYToARGBRow_MMX \n"
  "FastConvertYToARGBRow_MMX: \n"
#endif
  "push %ebx \n"
  // Only %ebx is saved (4 bytes), so arguments start at 0x8(%esp).
  "mov 0x8(%esp),%eax \n"
  "mov 0xc(%esp),%edx \n"
  "mov 0x10(%esp),%ecx \n"

  "1: \n"
  "movzbl (%eax),%ebx \n"
  "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
  "psraw $0x6,%mm0 \n"
  "movzbl 0x1(%eax),%ebx \n"
  "movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
  "psraw $0x6,%mm1 \n"
  "packuswb %mm1,%mm0 \n"
  "lea 0x2(%eax),%eax \n"
  "movq %mm0,(%edx) \n"
  "lea 0x8(%edx),%edx \n"
  "sub $0x2,%ecx \n"
  "ja 1b \n"
  "pop %ebx \n"
  "ret \n"
);

#endif

#ifdef HAS_ARGBTOYROW_SSSE3
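// There is no direct ABGR or BGRA to Y fast path; the wrappers below
// convert each row to ARGB in a temporary buffer and reuse the ARGB row
// functions, trading one extra pass per row for code reuse.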
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}

void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  ARGBToYRow_SSSE3(row, dst_y, pix);
}
#endif

#ifdef HAS_ARGBTOUVROW_SSSE3
void ABGRToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  ABGRToARGBRow_SSSE3(src_argb, row, pix);
  ABGRToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}

void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int pix) {
  SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
  BGRAToARGBRow_SSSE3(src_argb, row, pix);
  BGRAToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
  ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
}
#endif

}  // extern "C"