Mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-08 01:36:47 +08:00)
align asm new line to column 48
BUG=none
TEST=builds
Review URL: http://webrtc-codereview.appspot.com/268008
git-svn-id: http://libyuv.googlecode.com/svn/trunk@73 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent 2cb934c624
commit f7a5048f54
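The change is purely cosmetic: inside the GCC-style inline-asm blocks, each instruction string is padded with spaces so that its "\n" terminator begins in a fixed column (48 in the source), and `asm volatile(` gains a space before the parenthesis. A minimal sketch of the convention, using a hypothetical x86 snippet rather than code from this commit:

```cpp
#include <cstdint>

// Hypothetical illustration only (not libyuv code): the instruction text is
// followed by spaces so every "\n" terminator starts in one fixed column.
uint32_t add_one(uint32_t x) {
#if defined(__i386__) || defined(__x86_64__)
  asm volatile (
    "add  $0x1,%0                              \n"  // '\n' padded to a column
    : "+r"(x)  // %0: read/write operand
    :
    : "cc"     // add updates the flags
  );
#else
  ++x;  // keep the sketch buildable off x86
#endif
  return x;
}
```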
@ -23,35 +23,30 @@ namespace libyuv {
static uint32 SumSquareError_NEON(const uint8* src_a,
const uint8* src_b, int count) {
volatile uint32 sse;
asm volatile
(
"vmov.u8 q7, #0\n"
"vmov.u8 q9, #0\n"
"vmov.u8 q8, #0\n"
"vmov.u8 q10, #0\n"
asm volatile (
"vmov.u8 q7, #0 \n"
"vmov.u8 q9, #0 \n"
"vmov.u8 q8, #0 \n"
"vmov.u8 q10, #0 \n"

"1:\n"
"vld1.u8 {q0}, [%0]!\n"
"vld1.u8 {q1}, [%1]!\n"

"vsubl.u8 q2, d0, d2\n"
"vsubl.u8 q3, d1, d3\n"

"vmlal.s16 q7, d4, d4\n"
"vmlal.s16 q8, d6, d6\n"
"vmlal.s16 q8, d5, d5\n"
"vmlal.s16 q10, d7, d7\n"

"subs %2, %2, #16\n"
"bhi 1b\n"

"vadd.u32 q7, q7, q8\n"
"vadd.u32 q9, q9, q10\n"
"vadd.u32 q10, q7, q9\n"
"vpaddl.u32 q1, q10\n"
"vadd.u64 d0, d2, d3\n"
"vmov.32 %3, d0[0]\n"
"1: \n"
"vld1.u8 {q0}, [%0]! \n"
"vld1.u8 {q1}, [%1]! \n"
"vsubl.u8 q2, d0, d2 \n"
"vsubl.u8 q3, d1, d3 \n"
"vmlal.s16 q7, d4, d4 \n"
"vmlal.s16 q8, d6, d6 \n"
"vmlal.s16 q8, d5, d5 \n"
"vmlal.s16 q10, d7, d7 \n"
"subs %2, %2, #16 \n"
"bhi 1b \n"

"vadd.u32 q7, q7, q8 \n"
"vadd.u32 q9, q9, q10 \n"
"vadd.u32 q10, q7, q9 \n"
"vpaddl.u32 q1, q10 \n"
"vadd.u64 d0, d2, d3 \n"
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
"+r"(src_b),
"+r"(count),
@ -59,7 +54,6 @@ static uint32 SumSquareError_NEON(const uint8* src_a,
:
: "memory", "cc", "q0", "q1", "q2", "q3", "q7", "q8", "q9", "q10"
);

return sse;
}

@ -102,7 +96,6 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
|
||||
pshufd xmm1, xmm0, 01h
|
||||
paddd xmm0, xmm1
|
||||
movd eax, xmm0
|
||||
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -112,11 +105,12 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
|
||||
// DISABLE
|
||||
//#define HAS_SUMSQUAREERROR_SSE2
|
||||
// DISABLE
|
||||
#if HAS_SUMSQUAREERROR_SSE2
|
||||
static uint32 SumSquareError_SSE2(const uint8* src_a,
|
||||
const uint8* src_b, int count) {
|
||||
volatile uint32 sse;
|
||||
asm volatile(
|
||||
"\n"
|
||||
asm volatile (
|
||||
" \n"
|
||||
: "+r"(src_a), // %0
|
||||
"+r"(src_b), // %1
|
||||
"+r"(count), // %2
|
||||
@ -131,6 +125,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a,
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
|
||||
static uint32 SumSquareError_C(const uint8* src_a,
|
||||
const uint8* src_b, int count) {
|
||||
@ -148,7 +143,6 @@ uint64 ComputeSumSquareError(const uint8* src_a,
|
||||
const uint8* src_b, int count) {
|
||||
uint32 (*SumSquareError)(const uint8* src_a,
|
||||
const uint8* src_b, int count);
|
||||
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
SumSquareError = SumSquareError_NEON;
|
||||
@ -162,10 +156,8 @@ uint64 ComputeSumSquareError(const uint8* src_a,
|
||||
{
|
||||
SumSquareError = SumSquareError_C;
|
||||
}
|
||||
|
||||
const int kBlockSize = 4096;
|
||||
uint64 diff = 0;
|
||||
|
||||
while (count >= kBlockSize) {
|
||||
diff += SumSquareError(src_a, src_b, kBlockSize);
|
||||
src_a += kBlockSize;
|
||||
@ -179,7 +171,6 @@ uint64 ComputeSumSquareError(const uint8* src_a,
|
||||
diff += static_cast<uint64>(SumSquareError_C(src_a, src_b, count));
|
||||
}
|
||||
}
|
||||
|
||||
return diff;
|
||||
}
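The blocking above is what keeps the 32-bit per-call accumulator safe: the fast path is only ever asked to sum kBlockSize = 4096 squared differences at a time, and 4096 * 255 * 255 = 266,342,400, comfortably below 2^32 - 1, before the result is widened into the uint64 total. A rough scalar sketch of the same pattern (illustrative names, not the library's code path):

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the blocking idea: accumulate at most 4096 squared byte
// differences into a uint32 at a time, then widen to uint64.
uint64_t SumSquareErrorBlocked(const uint8_t* a, const uint8_t* b, size_t n) {
  const size_t kBlock = 4096;
  uint64_t total = 0;
  while (n > 0) {
    size_t len = n < kBlock ? n : kBlock;
    uint32_t partial = 0;  // safe: 4096 * 255^2 fits in 32 bits
    for (size_t i = 0; i < len; ++i) {
      int d = static_cast<int>(a[i]) - static_cast<int>(b[i]);
      partial += static_cast<uint32_t>(d * d);
    }
    total += partial;
    a += len;
    b += len;
    n -= len;
  }
  return total;
}
```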
|
||||
|
||||
@ -188,7 +179,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
|
||||
int width, int height) {
|
||||
uint32 (*SumSquareError)(const uint8* src_a,
|
||||
const uint8* src_b, int count);
|
||||
|
||||
#if defined(HAS_SUMSQUAREERROR_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON) &&
|
||||
(width % 16 == 0)) {
|
||||
@ -200,7 +190,6 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
|
||||
}
|
||||
|
||||
uint64 sse = 0;
|
||||
|
||||
for (int h = 0; h < height; ++h) {
|
||||
sse += static_cast<uint64>(SumSquareError(src_a, src_b, width));
|
||||
src_a += stride_a;
|
||||
@ -210,11 +199,10 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
|
||||
return sse;
|
||||
}
|
||||
|
||||
double Sse2Psnr(double Samples, double Sse) {
|
||||
double Sse2Psnr(double samples, double sse) {
|
||||
double psnr;
|
||||
|
||||
if (Sse > 0.0)
|
||||
psnr = 10.0 * log10(255.0 * 255.0 * Samples / Sse);
|
||||
if (sse > 0.0)
|
||||
psnr = 10.0 * log10(255.0 * 255.0 * samples / sse);
|
||||
else
|
||||
psnr = kMaxPsnr; // Limit to prevent divide by 0
|
||||
|
||||
@ -224,6 +212,21 @@ double Sse2Psnr(double Samples, double Sse) {
|
||||
return psnr;
|
||||
}
|
||||
|
||||
double Sse2Psnr(uint64 samples, uint64 sse) {
|
||||
double psnr;
|
||||
if (sse > 0) {
|
||||
double mse = static_cast<double>(samples) / static_cast<double>(sse);
|
||||
psnr = 10.0 * log10(255.0 * 255.0 * mse);
|
||||
} else {
|
||||
psnr = kMaxPsnr; // Limit to prevent divide by 0
|
||||
}
|
||||
|
||||
if (psnr > kMaxPsnr)
|
||||
psnr = kMaxPsnr;
|
||||
|
||||
return psnr;
|
||||
}
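Both overloads of Sse2Psnr implement the usual peak-signal-to-noise ratio for 8-bit samples (peak value 255), clamped to kMaxPsnr when the error is zero:

\[ \mathrm{PSNR} = 10 \cdot \log_{10}\!\left(\frac{255^2 \cdot N}{\mathrm{SSE}}\right) \]

where N is the number of samples compared and SSE is the accumulated sum of squared errors.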
|
||||
|
||||
double CalcFramePsnr(const uint8* src_a, int stride_a,
|
||||
const uint8* src_b, int stride_b,
|
||||
int width, int height) {
|
||||
@ -233,7 +236,7 @@ double CalcFramePsnr(const uint8* src_a, int stride_a,
|
||||
src_b, stride_b,
|
||||
width, height);
|
||||
|
||||
return Sse2Psnr (samples, sse);
|
||||
return Sse2Psnr(samples, sse);
|
||||
}
|
||||
|
||||
double I420Psnr(const uint8* src_y_a, int stride_y_a,
|
||||
|
||||
@ -22,9 +22,9 @@
|
||||
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
|
||||
static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||
asm volatile (
|
||||
"mov %%ebx, %%edi\n"
|
||||
"cpuid\n"
|
||||
"xchg %%edi, %%ebx\n"
|
||||
"mov %%ebx, %%edi \n"
|
||||
"cpuid \n"
|
||||
"xchg %%edi, %%ebx \n"
|
||||
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(info_type)
|
||||
);
|
||||
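The mov/xchg dance around cpuid exists because %ebx is reserved as the PIC base register on 32-bit PIC and Mac builds, so it cannot be listed as an output or clobber directly. A hypothetical caller (not from this commit) that uses the wrapper to test the SSE2 bit, which CPUID leaf 1 reports in EDX bit 26:

```cpp
// Hypothetical usage sketch of the __cpuid wrapper defined above.
static bool HasSse2() {
  int cpu_info[4];                         // {EAX, EBX, ECX, EDX} results
  __cpuid(cpu_info, 1);                    // leaf 1: feature flags
  return (cpu_info[3] & (1 << 26)) != 0;   // EDX bit 26 = SSE2
}
```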
@ -32,7 +32,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||
#elif defined(__i386__) || defined(__x86_64__)
|
||||
static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||
asm volatile (
|
||||
"cpuid\n"
|
||||
"cpuid \n"
|
||||
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
|
||||
: "a"(info_type)
|
||||
);
|
||||
|
||||
@ -50,17 +50,17 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
|
||||
#define HAS_ARGBTOBAYERROW_SSSE3
|
||||
static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
|
||||
uint32 selector, int pix) {
|
||||
asm volatile(
|
||||
"movd %3,%%xmm5\n"
|
||||
"pshufd $0x0,%%xmm5,%%xmm5\n"
|
||||
"1:\n"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"lea 0x10(%0),%0\n"
|
||||
"pshufb %%xmm5,%%xmm0\n"
|
||||
"movd %%xmm0,(%1)\n"
|
||||
"lea 0x4(%1),%1\n"
|
||||
"sub $0x4,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movd %3,%%xmm5 \n"
|
||||
"pshufd $0x0,%%xmm5,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"movd %%xmm0,(%1) \n"
|
||||
"lea 0x4(%1),%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_bayer), // %1
|
||||
"+r"(pix) // %2
|
||||
|
||||
@ -23,14 +23,13 @@ namespace libyuv {
|
||||
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
|
||||
static void SplitUV_NEON(const uint8* src_uv,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
__asm__ volatile
|
||||
(
|
||||
"1:\n"
|
||||
"vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV
|
||||
"vst1.u8 {q0}, [%1]!\n" // store U
|
||||
"vst1.u8 {q1}, [%2]!\n" // Store V
|
||||
"subs %3, %3, #16\n" // 16 processed per loop
|
||||
"bhi 1b\n"
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
|
||||
"vst1.u8 {q0}, [%1]! \n" // store U
|
||||
"vst1.u8 {q1}, [%2]! \n" // Store V
|
||||
"subs %3, %3, #16 \n" // 16 processed per loop
|
||||
"bhi 1b \n"
|
||||
: "+r"(src_uv),
|
||||
"+r"(dst_u),
|
||||
"+r"(dst_v),
|
||||
@ -57,7 +56,7 @@ static void SplitUV_SSE2(const uint8* src_uv,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
@ -74,7 +73,7 @@ static void SplitUV_SSE2(const uint8* src_uv,
|
||||
movdqa [edi], xmm2
|
||||
lea edi, [edi + 16]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
pop edi
|
||||
ret
|
||||
}
|
||||
@ -85,27 +84,27 @@ static void SplitUV_SSE2(const uint8* src_uv,
|
||||
#define HAS_SPLITUV_SSE2
|
||||
static void SplitUV_SSE2(const uint8* src_uv,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"psrlw $0x8,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"movdqa %%xmm0,%%xmm2\n"
|
||||
"movdqa %%xmm1,%%xmm3\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"pand %%xmm5,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"psrlw $0x8,%%xmm2\n"
|
||||
"psrlw $0x8,%%xmm3\n"
|
||||
"packuswb %%xmm3,%%xmm2\n"
|
||||
"movdqa %%xmm2,(%2)\n"
|
||||
"lea 0x10(%2),%2\n"
|
||||
"sub $0x10,%3\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x8,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"psrlw $0x8,%%xmm2 \n"
|
||||
"psrlw $0x8,%%xmm3 \n"
|
||||
"packuswb %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,(%2) \n"
|
||||
"lea 0x10(%2),%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_uv), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
@ -239,13 +238,12 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
|
||||
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)
|
||||
#define HAS_SETROW_NEON
|
||||
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
|
||||
__asm__ volatile
|
||||
(
|
||||
"vdup.u32 q0, %2\n" // duplicate 4 ints
|
||||
"1:\n"
|
||||
"vst1.u32 {q0}, [%0]!\n" // store
|
||||
"subs %1, %1, #16\n" // 16 processed per loop
|
||||
"bhi 1b\n"
|
||||
asm volatile (
|
||||
"vdup.u32 q0, %2 \n" // duplicate 4 ints
|
||||
"1: \n"
|
||||
"vst1.u32 {q0}, [%0]! \n" // store
|
||||
"subs %1, %1, #16 \n" // 16 processed per loop
|
||||
"bhi 1b \n"
|
||||
: "+r"(dst), // %0
|
||||
"+r"(count) // %1
|
||||
: "r"(v32) // %2
|
||||
@ -263,11 +261,11 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
|
||||
mov ecx, [esp + 12] // count
|
||||
pshufd xmm5, xmm5, 0
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa [eax], xmm5
|
||||
lea eax, [eax + 16]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -277,14 +275,14 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
|
||||
|
||||
#define HAS_SETROW_SSE2
|
||||
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
|
||||
asm volatile(
|
||||
"movd %2, %%xmm5\n"
|
||||
"pshufd $0x0,%%xmm5,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa %%xmm5,(%0)\n"
|
||||
"lea 0x10(%0),%0\n"
|
||||
"sub $0x10,%1\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movd %2, %%xmm5 \n"
|
||||
"pshufd $0x0,%%xmm5,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa %%xmm5,(%0) \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"sub $0x10,%1 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(dst), // %0
|
||||
"+r"(count) // %1
|
||||
: "r"(v32) // %2
|
||||
@ -561,7 +559,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
@ -585,7 +583,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
|
||||
movq qword ptr [edi], xmm1
|
||||
lea edi, [edi + 8]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
@ -598,34 +596,34 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
|
||||
#define HAS_SPLITYUY2_SSE2
|
||||
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"psrlw $0x8,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"movdqa %%xmm0,%%xmm2\n"
|
||||
"movdqa %%xmm1,%%xmm3\n"
|
||||
"pand %%xmm5,%%xmm2\n"
|
||||
"pand %%xmm5,%%xmm3\n"
|
||||
"packuswb %%xmm3,%%xmm2\n"
|
||||
"movdqa %%xmm2,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"psrlw $0x8,%%xmm0\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"packuswb %%xmm0,%%xmm0\n"
|
||||
"movq %%xmm0,(%2)\n"
|
||||
"lea 0x8(%2),%2\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm1\n"
|
||||
"movq %%xmm1,(%3)\n"
|
||||
"lea 0x8(%3),%3\n"
|
||||
"sub $0x10,%4\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x8,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"pand %%xmm5,%%xmm2 \n"
|
||||
"pand %%xmm5,%%xmm3 \n"
|
||||
"packuswb %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"psrlw $0x8,%%xmm0 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"movq %%xmm0,(%2) \n"
|
||||
"lea 0x8(%2),%2 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm1 \n"
|
||||
"movq %%xmm1,(%3) \n"
|
||||
"lea 0x8(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(dst_u), // %2
|
||||
@ -716,7 +714,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
@ -726,7 +724,7 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -745,7 +743,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + esi]
|
||||
@ -766,7 +764,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
||||
movq qword ptr [edi], xmm1
|
||||
lea edi, [edi + 8]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
@ -783,7 +781,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
|
||||
mov edx, [esp + 8] // dst_y
|
||||
mov ecx, [esp + 12] // pix
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
@ -793,7 +791,7 @@ void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -812,7 +810,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
|
||||
psrlw xmm5, 8
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + esi]
|
||||
@ -833,7 +831,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
movq qword ptr [edi], xmm1
|
||||
lea edi, [edi + 8]
|
||||
sub ecx, 16
|
||||
ja wloop
|
||||
ja convertloop
|
||||
|
||||
pop edi
|
||||
pop esi
|
||||
@ -847,20 +845,20 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
#define HAS_YUY2TOI420ROW_SSE2
|
||||
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
|
||||
uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"psrlw $0x8,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"pand %%xmm5,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x8,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
@ -874,31 +872,31 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
|
||||
|
||||
static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
||||
uint8* dst_u, uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"psrlw $0x8,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa (%0,%4,1),%%xmm2\n"
|
||||
"movdqa 0x10(%0,%4,1),%%xmm3\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"pavgb %%xmm2,%%xmm0\n"
|
||||
"pavgb %%xmm3,%%xmm1\n"
|
||||
"psrlw $0x8,%%xmm0\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"packuswb %%xmm0,%%xmm0\n"
|
||||
"movq %%xmm0,(%1)\n"
|
||||
"lea 0x8(%1),%1\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm1\n"
|
||||
"movq %%xmm1,(%2)\n"
|
||||
"lea 0x8(%2),%2\n"
|
||||
"sub $0x10,%3\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x8,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa (%0,%4,1),%%xmm2 \n"
|
||||
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"pavgb %%xmm2,%%xmm0 \n"
|
||||
"pavgb %%xmm3,%%xmm1 \n"
|
||||
"psrlw $0x8,%%xmm0 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"lea 0x8(%1),%1 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm1 \n"
|
||||
"movq %%xmm1,(%2) \n"
|
||||
"lea 0x8(%2),%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_yuy2), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_y), // %2
|
||||
@ -913,18 +911,18 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
||||
#define HAS_UYVYTOI420ROW_SSE2
|
||||
static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
|
||||
uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"psrlw $0x8,%%xmm0\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"psrlw $0x8,%%xmm0 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
@ -938,31 +936,31 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
|
||||
|
||||
static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
uint8* dst_u, uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"psrlw $0x8,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa (%0,%4,1),%%xmm2\n"
|
||||
"movdqa 0x10(%0,%4,1),%%xmm3\n"
|
||||
"lea 0x20(%0),%0\n"
|
||||
"pavgb %%xmm2,%%xmm0\n"
|
||||
"pavgb %%xmm3,%%xmm1\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"pand %%xmm5,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm0\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"pand %%xmm5,%%xmm0\n"
|
||||
"packuswb %%xmm0,%%xmm0\n"
|
||||
"movq %%xmm0,(%1)\n"
|
||||
"lea 0x8(%1),%1\n"
|
||||
"psrlw $0x8,%%xmm1\n"
|
||||
"packuswb %%xmm1,%%xmm1\n"
|
||||
"movq %%xmm1,(%2)\n"
|
||||
"lea 0x8(%2),%2\n"
|
||||
"sub $0x10,%3\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"psrlw $0x8,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa (%0,%4,1),%%xmm2 \n"
|
||||
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
|
||||
"lea 0x20(%0),%0 \n"
|
||||
"pavgb %%xmm2,%%xmm0 \n"
|
||||
"pavgb %%xmm3,%%xmm1 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"pand %%xmm5,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"pand %%xmm5,%%xmm0 \n"
|
||||
"packuswb %%xmm0,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"lea 0x8(%1),%1 \n"
|
||||
"psrlw $0x8,%%xmm1 \n"
|
||||
"packuswb %%xmm1,%%xmm1 \n"
|
||||
"movq %%xmm1,(%2) \n"
|
||||
"lea 0x8(%2),%2 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_uyvy), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_y), // %2
|
||||
|
||||
830  source/rotate.cc
@ -282,78 +282,78 @@ __asm {
|
||||
#define HAS_TRANSPOSE_WX8_SSSE3
|
||||
static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
"1:\n"
|
||||
"movq (%0),%%xmm0\n"
|
||||
"movq (%0,%3),%%xmm1\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"punpcklbw %%xmm1,%%xmm0\n"
|
||||
"movq (%0),%%xmm2\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1\n"
|
||||
"movq (%0,%3),%%xmm3\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"punpcklbw %%xmm3,%%xmm2\n"
|
||||
"movdqa %%xmm2,%%xmm3\n"
|
||||
"movq (%0),%%xmm4\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"movq (%0,%3),%%xmm5\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"punpcklbw %%xmm5,%%xmm4\n"
|
||||
"movdqa %%xmm4,%%xmm5\n"
|
||||
"movq (%0),%%xmm6\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"movq (%0,%3),%%xmm7\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"punpcklbw %%xmm7,%%xmm6\n"
|
||||
"neg %3\n"
|
||||
"movdqa %%xmm6,%%xmm7\n"
|
||||
"lea 0x8(%0,%3,8),%0\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"neg %3\n"
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"movq (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"movq (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"movq (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movq (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"movq (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movq (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"lea 0x8(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0\n"
|
||||
"punpcklwd %%xmm3,%%xmm1\n"
|
||||
"movdqa %%xmm0,%%xmm2\n"
|
||||
"movdqa %%xmm1,%%xmm3\n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"punpcklwd %%xmm6,%%xmm4\n"
|
||||
"punpcklwd %%xmm7,%%xmm5\n"
|
||||
"movdqa %%xmm4,%%xmm6\n"
|
||||
"movdqa %%xmm5,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0\n"
|
||||
"movq %%xmm0,(%1)\n"
|
||||
"movdqa %%xmm0,%%xmm4\n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4\n"
|
||||
"movq %%xmm4,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm6,%%xmm2\n"
|
||||
"movdqa %%xmm2,%%xmm6\n"
|
||||
"movq %%xmm2,(%1)\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"punpckldq %%xmm5,%%xmm1\n"
|
||||
"movq %%xmm6,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"movdqa %%xmm1,%%xmm5\n"
|
||||
"movq %%xmm1,(%1)\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"movq %%xmm5,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm7,%%xmm3\n"
|
||||
"movq %%xmm3,(%1)\n"
|
||||
"movdqa %%xmm3,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"movq %%xmm7,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"sub $0x8,%2\n"
|
||||
"ja 1b\n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
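For reference, a scalar equivalent of what TransposeWx8 computes: an 8-row strip of `width` columns is written out with rows and columns exchanged. This is only an illustrative sketch (the name is made up, it is not libyuv's C fallback); the SSSE3/NEON versions above produce the same result for 8x8 tiles using the three rounds of byte/word/dword interleaves noted in the comments.

```cpp
#include <cstdint>

// Illustrative scalar reference: transpose an 8-row by `width`-column strip
// so that columns become rows.
void TransposeWx8_Scalar(const uint8_t* src, int src_stride,
                         uint8_t* dst, int dst_stride, int width) {
  for (int x = 0; x < width; ++x) {
    for (int y = 0; y < 8; ++y) {
      dst[x * dst_stride + y] = src[y * src_stride + x];
    }
  }
}
```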
@ -372,258 +372,258 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w);
|
||||
asm(
|
||||
".text\n"
|
||||
asm volatile (
|
||||
".text \n"
|
||||
#if defined(OSX)
|
||||
".globl _TransposeUVWx8_SSE2\n"
|
||||
"_TransposeUVWx8_SSE2:\n"
|
||||
".globl _TransposeUVWx8_SSE2 \n"
|
||||
"_TransposeUVWx8_SSE2: \n"
|
||||
#else
|
||||
".global TransposeUVWx8_SSE2\n"
|
||||
"TransposeUVWx8_SSE2:\n"
|
||||
".global TransposeUVWx8_SSE2 \n"
|
||||
"TransposeUVWx8_SSE2: \n"
|
||||
#endif
|
||||
"push %ebx\n"
|
||||
"push %esi\n"
|
||||
"push %edi\n"
|
||||
"push %ebp\n"
|
||||
"mov 0x14(%esp),%eax\n"
|
||||
"mov 0x18(%esp),%edi\n"
|
||||
"mov 0x1c(%esp),%edx\n"
|
||||
"mov 0x20(%esp),%esi\n"
|
||||
"mov 0x24(%esp),%ebx\n"
|
||||
"mov 0x28(%esp),%ebp\n"
|
||||
"mov %esp,%ecx\n"
|
||||
"sub $0x14,%esp\n"
|
||||
"and $0xfffffff0,%esp\n"
|
||||
"mov %ecx,0x10(%esp)\n"
|
||||
"mov 0x2c(%ecx),%ecx\n"
|
||||
"push %ebx \n"
|
||||
"push %esi \n"
|
||||
"push %edi \n"
|
||||
"push %ebp \n"
|
||||
"mov 0x14(%esp),%eax \n"
|
||||
"mov 0x18(%esp),%edi \n"
|
||||
"mov 0x1c(%esp),%edx \n"
|
||||
"mov 0x20(%esp),%esi \n"
|
||||
"mov 0x24(%esp),%ebx \n"
|
||||
"mov 0x28(%esp),%ebp \n"
|
||||
"mov %esp,%ecx \n"
|
||||
"sub $0x14,%esp \n"
|
||||
"and $0xfffffff0,%esp \n"
|
||||
"mov %ecx,0x10(%esp) \n"
|
||||
"mov 0x2c(%ecx),%ecx \n"
|
||||
|
||||
"1:\n"
|
||||
"movdqa (%eax),%xmm0\n"
|
||||
"movdqa (%eax,%edi,1),%xmm1\n"
|
||||
"lea (%eax,%edi,2),%eax\n"
|
||||
"movdqa %xmm0,%xmm7\n"
|
||||
"punpcklbw %xmm1,%xmm0\n"
|
||||
"punpckhbw %xmm1,%xmm7\n"
|
||||
"movdqa %xmm7,%xmm1\n"
|
||||
"movdqa (%eax),%xmm2\n"
|
||||
"movdqa (%eax,%edi,1),%xmm3\n"
|
||||
"lea (%eax,%edi,2),%eax\n"
|
||||
"movdqa %xmm2,%xmm7\n"
|
||||
"punpcklbw %xmm3,%xmm2\n"
|
||||
"punpckhbw %xmm3,%xmm7\n"
|
||||
"movdqa %xmm7,%xmm3\n"
|
||||
"movdqa (%eax),%xmm4\n"
|
||||
"movdqa (%eax,%edi,1),%xmm5\n"
|
||||
"lea (%eax,%edi,2),%eax\n"
|
||||
"movdqa %xmm4,%xmm7\n"
|
||||
"punpcklbw %xmm5,%xmm4\n"
|
||||
"punpckhbw %xmm5,%xmm7\n"
|
||||
"movdqa %xmm7,%xmm5\n"
|
||||
"movdqa (%eax),%xmm6\n"
|
||||
"movdqa (%eax,%edi,1),%xmm7\n"
|
||||
"lea (%eax,%edi,2),%eax\n"
|
||||
"movdqa %xmm5,(%esp)\n"
|
||||
"neg %edi\n"
|
||||
"movdqa %xmm6,%xmm5\n"
|
||||
"punpcklbw %xmm7,%xmm6\n"
|
||||
"punpckhbw %xmm7,%xmm5\n"
|
||||
"movdqa %xmm5,%xmm7\n"
|
||||
"lea 0x10(%eax,%edi,8),%eax\n"
|
||||
"neg %edi\n"
|
||||
"movdqa %xmm0,%xmm5\n"
|
||||
"punpcklwd %xmm2,%xmm0\n"
|
||||
"punpckhwd %xmm2,%xmm5\n"
|
||||
"movdqa %xmm5,%xmm2\n"
|
||||
"movdqa %xmm1,%xmm5\n"
|
||||
"punpcklwd %xmm3,%xmm1\n"
|
||||
"punpckhwd %xmm3,%xmm5\n"
|
||||
"movdqa %xmm5,%xmm3\n"
|
||||
"movdqa %xmm4,%xmm5\n"
|
||||
"punpcklwd %xmm6,%xmm4\n"
|
||||
"punpckhwd %xmm6,%xmm5\n"
|
||||
"movdqa %xmm5,%xmm6\n"
|
||||
"movdqa (%esp),%xmm5\n"
|
||||
"movdqa %xmm6,(%esp)\n"
|
||||
"movdqa %xmm5,%xmm6\n"
|
||||
"punpcklwd %xmm7,%xmm5\n"
|
||||
"punpckhwd %xmm7,%xmm6\n"
|
||||
"movdqa %xmm6,%xmm7\n"
|
||||
"movdqa %xmm0,%xmm6\n"
|
||||
"punpckldq %xmm4,%xmm0\n"
|
||||
"punpckhdq %xmm4,%xmm6\n"
|
||||
"movdqa %xmm6,%xmm4\n"
|
||||
"movdqa (%esp),%xmm6\n"
|
||||
"movlpd %xmm0,(%edx)\n"
|
||||
"movhpd %xmm0,(%ebx)\n"
|
||||
"movlpd %xmm4,(%edx,%esi,1)\n"
|
||||
"lea (%edx,%esi,2),%edx\n"
|
||||
"movhpd %xmm4,(%ebx,%ebp,1)\n"
|
||||
"lea (%ebx,%ebp,2),%ebx\n"
|
||||
"movdqa %xmm2,%xmm0\n"
|
||||
"punpckldq %xmm6,%xmm2\n"
|
||||
"movlpd %xmm2,(%edx)\n"
|
||||
"movhpd %xmm2,(%ebx)\n"
|
||||
"punpckhdq %xmm6,%xmm0\n"
|
||||
"movlpd %xmm0,(%edx,%esi,1)\n"
|
||||
"lea (%edx,%esi,2),%edx\n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1)\n"
|
||||
"lea (%ebx,%ebp,2),%ebx\n"
|
||||
"movdqa %xmm1,%xmm0\n"
|
||||
"punpckldq %xmm5,%xmm1\n"
|
||||
"movlpd %xmm1,(%edx)\n"
|
||||
"movhpd %xmm1,(%ebx)\n"
|
||||
"punpckhdq %xmm5,%xmm0\n"
|
||||
"movlpd %xmm0,(%edx,%esi,1)\n"
|
||||
"lea (%edx,%esi,2),%edx\n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1)\n"
|
||||
"lea (%ebx,%ebp,2),%ebx\n"
|
||||
"movdqa %xmm3,%xmm0\n"
|
||||
"punpckldq %xmm7,%xmm3\n"
|
||||
"movlpd %xmm3,(%edx)\n"
|
||||
"movhpd %xmm3,(%ebx)\n"
|
||||
"punpckhdq %xmm7,%xmm0\n"
|
||||
"movlpd %xmm0,(%edx,%esi,1)\n"
|
||||
"lea (%edx,%esi,2),%edx\n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1)\n"
|
||||
"lea (%ebx,%ebp,2),%ebx\n"
|
||||
"sub $0x8,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"mov 0x10(%esp),%esp\n"
|
||||
"pop %ebp\n"
|
||||
"pop %edi\n"
|
||||
"pop %esi\n"
|
||||
"pop %ebx\n"
|
||||
"ret\n"
|
||||
"1: \n"
|
||||
"movdqa (%eax),%xmm0 \n"
|
||||
"movdqa (%eax,%edi,1),%xmm1 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm0,%xmm7 \n"
|
||||
"punpcklbw %xmm1,%xmm0 \n"
|
||||
"punpckhbw %xmm1,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm1 \n"
|
||||
"movdqa (%eax),%xmm2 \n"
|
||||
"movdqa (%eax,%edi,1),%xmm3 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm2,%xmm7 \n"
|
||||
"punpcklbw %xmm3,%xmm2 \n"
|
||||
"punpckhbw %xmm3,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm3 \n"
|
||||
"movdqa (%eax),%xmm4 \n"
|
||||
"movdqa (%eax,%edi,1),%xmm5 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm4,%xmm7 \n"
|
||||
"punpcklbw %xmm5,%xmm4 \n"
|
||||
"punpckhbw %xmm5,%xmm7 \n"
|
||||
"movdqa %xmm7,%xmm5 \n"
|
||||
"movdqa (%eax),%xmm6 \n"
|
||||
"movdqa (%eax,%edi,1),%xmm7 \n"
|
||||
"lea (%eax,%edi,2),%eax \n"
|
||||
"movdqa %xmm5,(%esp) \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm6,%xmm5 \n"
|
||||
"punpcklbw %xmm7,%xmm6 \n"
|
||||
"punpckhbw %xmm7,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm7 \n"
|
||||
"lea 0x10(%eax,%edi,8),%eax \n"
|
||||
"neg %edi \n"
|
||||
"movdqa %xmm0,%xmm5 \n"
|
||||
"punpcklwd %xmm2,%xmm0 \n"
|
||||
"punpckhwd %xmm2,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm2 \n"
|
||||
"movdqa %xmm1,%xmm5 \n"
|
||||
"punpcklwd %xmm3,%xmm1 \n"
|
||||
"punpckhwd %xmm3,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm3 \n"
|
||||
"movdqa %xmm4,%xmm5 \n"
|
||||
"punpcklwd %xmm6,%xmm4 \n"
|
||||
"punpckhwd %xmm6,%xmm5 \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"movdqa (%esp),%xmm5 \n"
|
||||
"movdqa %xmm6,(%esp) \n"
|
||||
"movdqa %xmm5,%xmm6 \n"
|
||||
"punpcklwd %xmm7,%xmm5 \n"
|
||||
"punpckhwd %xmm7,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm7 \n"
|
||||
"movdqa %xmm0,%xmm6 \n"
|
||||
"punpckldq %xmm4,%xmm0 \n"
|
||||
"punpckhdq %xmm4,%xmm6 \n"
|
||||
"movdqa %xmm6,%xmm4 \n"
|
||||
"movdqa (%esp),%xmm6 \n"
|
||||
"movlpd %xmm0,(%edx) \n"
|
||||
"movhpd %xmm0,(%ebx) \n"
|
||||
"movlpd %xmm4,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm4,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm2,%xmm0 \n"
|
||||
"punpckldq %xmm6,%xmm2 \n"
|
||||
"movlpd %xmm2,(%edx) \n"
|
||||
"movhpd %xmm2,(%ebx) \n"
|
||||
"punpckhdq %xmm6,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm1,%xmm0 \n"
|
||||
"punpckldq %xmm5,%xmm1 \n"
|
||||
"movlpd %xmm1,(%edx) \n"
|
||||
"movhpd %xmm1,(%ebx) \n"
|
||||
"punpckhdq %xmm5,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"movdqa %xmm3,%xmm0 \n"
|
||||
"punpckldq %xmm7,%xmm3 \n"
|
||||
"movlpd %xmm3,(%edx) \n"
|
||||
"movhpd %xmm3,(%ebx) \n"
|
||||
"punpckhdq %xmm7,%xmm0 \n"
|
||||
"movlpd %xmm0,(%edx,%esi,1) \n"
|
||||
"lea (%edx,%esi,2),%edx \n"
|
||||
"movhpd %xmm0,(%ebx,%ebp,1) \n"
|
||||
"lea (%ebx,%ebp,2),%ebx \n"
|
||||
"sub $0x8,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"mov 0x10(%esp),%esp \n"
|
||||
"pop %ebp \n"
|
||||
"pop %edi \n"
|
||||
"pop %esi \n"
|
||||
"pop %ebx \n"
|
||||
"ret \n"
|
||||
);
|
||||
#elif defined (__x86_64__)
|
||||
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
|
||||
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
|
||||
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
"1:\n"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa (%0,%3),%%xmm1\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm0,%%xmm8\n"
|
||||
"punpcklbw %%xmm1,%%xmm0\n"
|
||||
"punpckhbw %%xmm1,%%xmm8\n"
|
||||
"movdqa (%0),%%xmm2\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"movdqa %%xmm8,%%xmm9\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1\n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9\n"
|
||||
"movdqa (%0,%3),%%xmm3\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm2,%%xmm10\n"
|
||||
"punpcklbw %%xmm3,%%xmm2\n"
|
||||
"punpckhbw %%xmm3,%%xmm10\n"
|
||||
"movdqa %%xmm2,%%xmm3\n"
|
||||
"movdqa %%xmm10,%%xmm11\n"
|
||||
"movdqa (%0),%%xmm4\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11\n"
|
||||
"movdqa (%0,%3),%%xmm5\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm4,%%xmm12\n"
|
||||
"punpcklbw %%xmm5,%%xmm4\n"
|
||||
"punpckhbw %%xmm5,%%xmm12\n"
|
||||
"movdqa %%xmm4,%%xmm5\n"
|
||||
"movdqa %%xmm12,%%xmm13\n"
|
||||
"movdqa (%0),%%xmm6\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13\n"
|
||||
"movdqa (%0,%3),%%xmm7\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm6,%%xmm14\n"
|
||||
"punpcklbw %%xmm7,%%xmm6\n"
|
||||
"punpckhbw %%xmm7,%%xmm14\n"
|
||||
"neg %3\n"
|
||||
"movdqa %%xmm6,%%xmm7\n"
|
||||
"movdqa %%xmm14,%%xmm15\n"
|
||||
"lea 0x10(%0,%3,8),%0\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
"neg %3\n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa (%0,%3),%%xmm1 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa (%0),%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm9 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1 \n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9 \n"
|
||||
"movdqa (%0,%3),%%xmm3 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm10 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm10 \n"
|
||||
"movdqa %%xmm2,%%xmm3 \n"
|
||||
"movdqa %%xmm10,%%xmm11 \n"
|
||||
"movdqa (%0),%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"movdqa (%0,%3),%%xmm5 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm12 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm12 \n"
|
||||
"movdqa %%xmm4,%%xmm5 \n"
|
||||
"movdqa %%xmm12,%%xmm13 \n"
|
||||
"movdqa (%0),%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movdqa (%0,%3),%%xmm7 \n"
|
||||
"lea (%0,%3,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm14 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"punpckhbw %%xmm7,%%xmm14 \n"
|
||||
"neg %3 \n"
|
||||
"movdqa %%xmm6,%%xmm7 \n"
|
||||
"movdqa %%xmm14,%%xmm15 \n"
|
||||
"lea 0x10(%0,%3,8),%0 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"neg %3 \n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0\n"
|
||||
"punpcklwd %%xmm3,%%xmm1\n"
|
||||
"movdqa %%xmm0,%%xmm2\n"
|
||||
"movdqa %%xmm1,%%xmm3\n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"punpcklwd %%xmm6,%%xmm4\n"
|
||||
"punpcklwd %%xmm7,%%xmm5\n"
|
||||
"movdqa %%xmm4,%%xmm6\n"
|
||||
"movdqa %%xmm5,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"punpcklwd %%xmm10,%%xmm8\n"
|
||||
"punpcklwd %%xmm11,%%xmm9\n"
|
||||
"movdqa %%xmm8,%%xmm10\n"
|
||||
"movdqa %%xmm9,%%xmm11\n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10\n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11\n"
|
||||
"punpcklwd %%xmm14,%%xmm12\n"
|
||||
"punpcklwd %%xmm15,%%xmm13\n"
|
||||
"movdqa %%xmm12,%%xmm14\n"
|
||||
"movdqa %%xmm13,%%xmm15\n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm0,%%xmm2 \n"
|
||||
"movdqa %%xmm1,%%xmm3 \n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm4,%%xmm6 \n"
|
||||
"movdqa %%xmm5,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"punpcklwd %%xmm10,%%xmm8 \n"
|
||||
"punpcklwd %%xmm11,%%xmm9 \n"
|
||||
"movdqa %%xmm8,%%xmm10 \n"
|
||||
"movdqa %%xmm9,%%xmm11 \n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10 \n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11 \n"
|
||||
"punpcklwd %%xmm14,%%xmm12 \n"
|
||||
"punpcklwd %%xmm15,%%xmm13 \n"
|
||||
"movdqa %%xmm12,%%xmm14 \n"
|
||||
"movdqa %%xmm13,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0\n"
|
||||
"movq %%xmm0,(%1)\n"
|
||||
"movdqa %%xmm0,%%xmm4\n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4\n"
|
||||
"movq %%xmm4,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm6,%%xmm2\n"
|
||||
"movdqa %%xmm2,%%xmm6\n"
|
||||
"movq %%xmm2,(%1)\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"punpckldq %%xmm5,%%xmm1\n"
|
||||
"movq %%xmm6,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"movdqa %%xmm1,%%xmm5\n"
|
||||
"movq %%xmm1,(%1)\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"movq %%xmm5,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm7,%%xmm3\n"
|
||||
"movq %%xmm3,(%1)\n"
|
||||
"movdqa %%xmm3,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"movq %%xmm7,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm12,%%xmm8\n"
|
||||
"movq %%xmm8,(%1)\n"
|
||||
"movdqa %%xmm8,%%xmm12\n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12\n"
|
||||
"movq %%xmm12,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm14,%%xmm10\n"
|
||||
"movdqa %%xmm10,%%xmm14\n"
|
||||
"movq %%xmm10,(%1)\n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14\n"
|
||||
"punpckldq %%xmm13,%%xmm9\n"
|
||||
"movq %%xmm14,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"movdqa %%xmm9,%%xmm13\n"
|
||||
"movq %%xmm9,(%1)\n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13\n"
|
||||
"movq %%xmm13,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm15,%%xmm11\n"
|
||||
"movq %%xmm11,(%1)\n"
|
||||
"movdqa %%xmm11,%%xmm15\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
"movq %%xmm15,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movq %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm0,%%xmm4 \n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4 \n"
|
||||
"movq %%xmm4,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"movq %%xmm2,(%1) \n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movq %%xmm6,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm1,%%xmm5 \n"
|
||||
"movq %%xmm1,(%1) \n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5 \n"
|
||||
"movq %%xmm5,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movq %%xmm3,(%1) \n"
|
||||
"movdqa %%xmm3,%%xmm7 \n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7 \n"
|
||||
"movq %%xmm7,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm12,%%xmm8 \n"
|
||||
"movq %%xmm8,(%1) \n"
|
||||
"movdqa %%xmm8,%%xmm12 \n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12 \n"
|
||||
"movq %%xmm12,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm14,%%xmm10 \n"
|
||||
"movdqa %%xmm10,%%xmm14 \n"
|
||||
"movq %%xmm10,(%1) \n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14 \n"
|
||||
"punpckldq %%xmm13,%%xmm9 \n"
|
||||
"movq %%xmm14,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"movdqa %%xmm9,%%xmm13 \n"
|
||||
"movq %%xmm9,(%1) \n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13 \n"
|
||||
"movq %%xmm13,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"punpckldq %%xmm15,%%xmm11 \n"
|
||||
"movq %%xmm11,(%1) \n"
|
||||
"movdqa %%xmm11,%%xmm15 \n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15 \n"
|
||||
"movq %%xmm15,(%1,%4) \n"
|
||||
"lea (%1,%4,2),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
@ -640,98 +640,98 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int w) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
"1:\n"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa (%0,%4),%%xmm1\n"
|
||||
"lea (%0,%4,2),%0\n"
|
||||
"movdqa %%xmm0,%%xmm8\n"
|
||||
"punpcklbw %%xmm1,%%xmm0\n"
|
||||
"punpckhbw %%xmm1,%%xmm8\n"
|
||||
"movdqa %%xmm8,%%xmm1\n"
|
||||
"movdqa (%0),%%xmm2\n"
|
||||
"movdqa (%0,%4),%%xmm3\n"
|
||||
"lea (%0,%4,2),%0\n"
|
||||
"movdqa %%xmm2,%%xmm8\n"
|
||||
"punpcklbw %%xmm3,%%xmm2\n"
|
||||
"punpckhbw %%xmm3,%%xmm8\n"
|
||||
"movdqa %%xmm8,%%xmm3\n"
|
||||
"movdqa (%0),%%xmm4\n"
|
||||
"movdqa (%0,%4),%%xmm5\n"
|
||||
"lea (%0,%4,2),%0\n"
|
||||
"movdqa %%xmm4,%%xmm8\n"
|
||||
"punpcklbw %%xmm5,%%xmm4\n"
|
||||
"punpckhbw %%xmm5,%%xmm8\n"
|
||||
"movdqa %%xmm8,%%xmm5\n"
|
||||
"movdqa (%0),%%xmm6\n"
|
||||
"movdqa (%0,%4),%%xmm7\n"
|
||||
"lea (%0,%4,2),%0\n"
|
||||
"movdqa %%xmm6,%%xmm8\n"
|
||||
"punpcklbw %%xmm7,%%xmm6\n"
|
||||
"neg %4\n"
|
||||
"lea 0x10(%0,%4,8),%0\n"
|
||||
"punpckhbw %%xmm7,%%xmm8\n"
|
||||
"movdqa %%xmm8,%%xmm7\n"
|
||||
"neg %4\n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa (%0,%4),%%xmm1 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpcklbw %%xmm1,%%xmm0 \n"
|
||||
"punpckhbw %%xmm1,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm1 \n"
|
||||
"movdqa (%0),%%xmm2 \n"
|
||||
"movdqa (%0,%4),%%xmm3 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpcklbw %%xmm3,%%xmm2 \n"
|
||||
"punpckhbw %%xmm3,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm3 \n"
|
||||
"movdqa (%0),%%xmm4 \n"
|
||||
"movdqa (%0,%4),%%xmm5 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"punpcklbw %%xmm5,%%xmm4 \n"
|
||||
"punpckhbw %%xmm5,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm5 \n"
|
||||
"movdqa (%0),%%xmm6 \n"
|
||||
"movdqa (%0,%4),%%xmm7 \n"
|
||||
"lea (%0,%4,2),%0 \n"
|
||||
"movdqa %%xmm6,%%xmm8 \n"
|
||||
"punpcklbw %%xmm7,%%xmm6 \n"
|
||||
"neg %4 \n"
|
||||
"lea 0x10(%0,%4,8),%0 \n"
|
||||
"punpckhbw %%xmm7,%%xmm8 \n"
|
||||
"movdqa %%xmm8,%%xmm7 \n"
|
||||
"neg %4 \n"
|
||||
// Second round of bit swap.
|
||||
"movdqa %%xmm0,%%xmm8\n"
|
||||
"movdqa %%xmm1,%%xmm9\n"
|
||||
"punpckhwd %%xmm2,%%xmm8\n"
|
||||
"punpckhwd %%xmm3,%%xmm9\n"
|
||||
"punpcklwd %%xmm2,%%xmm0\n"
|
||||
"punpcklwd %%xmm3,%%xmm1\n"
|
||||
"movdqa %%xmm8,%%xmm2\n"
|
||||
"movdqa %%xmm9,%%xmm3\n"
|
||||
"movdqa %%xmm4,%%xmm8\n"
|
||||
"movdqa %%xmm5,%%xmm9\n"
|
||||
"punpckhwd %%xmm6,%%xmm8\n"
|
||||
"punpckhwd %%xmm7,%%xmm9\n"
|
||||
"punpcklwd %%xmm6,%%xmm4\n"
|
||||
"punpcklwd %%xmm7,%%xmm5\n"
|
||||
"movdqa %%xmm8,%%xmm6\n"
|
||||
"movdqa %%xmm9,%%xmm7\n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"movdqa %%xmm1,%%xmm9 \n"
|
||||
"punpckhwd %%xmm2,%%xmm8 \n"
|
||||
"punpckhwd %%xmm3,%%xmm9 \n"
|
||||
"punpcklwd %%xmm2,%%xmm0 \n"
|
||||
"punpcklwd %%xmm3,%%xmm1 \n"
|
||||
"movdqa %%xmm8,%%xmm2 \n"
|
||||
"movdqa %%xmm9,%%xmm3 \n"
|
||||
"movdqa %%xmm4,%%xmm8 \n"
|
||||
"movdqa %%xmm5,%%xmm9 \n"
|
||||
"punpckhwd %%xmm6,%%xmm8 \n"
|
||||
"punpckhwd %%xmm7,%%xmm9 \n"
|
||||
"punpcklwd %%xmm6,%%xmm4 \n"
|
||||
"punpcklwd %%xmm7,%%xmm5 \n"
|
||||
"movdqa %%xmm8,%%xmm6 \n"
|
||||
"movdqa %%xmm9,%%xmm7 \n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"movdqa %%xmm0,%%xmm8\n"
|
||||
"punpckldq %%xmm4,%%xmm0\n"
|
||||
"movlpd %%xmm0,(%1)\n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2)\n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8\n"
|
||||
"movlpd %%xmm8,(%1,%5)\n"
|
||||
"lea (%1,%5,2),%1\n"
|
||||
"movhpd %%xmm8,(%2,%6)\n"
|
||||
"lea (%2,%6,2),%2\n"
|
||||
"movdqa %%xmm2,%%xmm8\n"
|
||||
"punpckldq %%xmm6,%%xmm2\n"
|
||||
"movlpd %%xmm2,(%1)\n"
|
||||
"movhpd %%xmm2,(%2)\n"
|
||||
"punpckhdq %%xmm6,%%xmm8\n"
|
||||
"movlpd %%xmm8,(%1,%5)\n"
|
||||
"lea (%1,%5,2),%1\n"
|
||||
"movhpd %%xmm8,(%2,%6)\n"
|
||||
"lea (%2,%6,2),%2\n"
|
||||
"movdqa %%xmm1,%%xmm8\n"
|
||||
"punpckldq %%xmm5,%%xmm1\n"
|
||||
"movlpd %%xmm1,(%1)\n"
|
||||
"movhpd %%xmm1,(%2)\n"
|
||||
"punpckhdq %%xmm5,%%xmm8\n"
|
||||
"movlpd %%xmm8,(%1,%5)\n"
|
||||
"lea (%1,%5,2),%1\n"
|
||||
"movhpd %%xmm8,(%2,%6)\n"
|
||||
"lea (%2,%6,2),%2\n"
|
||||
"movdqa %%xmm3,%%xmm8\n"
|
||||
"punpckldq %%xmm7,%%xmm3\n"
|
||||
"movlpd %%xmm3,(%1)\n"
|
||||
"movhpd %%xmm3,(%2)\n"
|
||||
"punpckhdq %%xmm7,%%xmm8\n"
|
||||
"movlpd %%xmm8,(%1,%5)\n"
|
||||
"lea (%1,%5,2),%1\n"
|
||||
"movhpd %%xmm8,(%2,%6)\n"
|
||||
"lea (%2,%6,2),%2\n"
|
||||
"sub $0x8,%3\n"
|
||||
"ja 1b\n"
|
||||
"movdqa %%xmm0,%%xmm8 \n"
|
||||
"punpckldq %%xmm4,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n" // Write back U channel
|
||||
"movhpd %%xmm0,(%2) \n" // Write back V channel
|
||||
"punpckhdq %%xmm4,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm2,%%xmm8 \n"
|
||||
"punpckldq %%xmm6,%%xmm2 \n"
|
||||
"movlpd %%xmm2,(%1) \n"
|
||||
"movhpd %%xmm2,(%2) \n"
|
||||
"punpckhdq %%xmm6,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm1,%%xmm8 \n"
|
||||
"punpckldq %%xmm5,%%xmm1 \n"
|
||||
"movlpd %%xmm1,(%1) \n"
|
||||
"movhpd %%xmm1,(%2) \n"
|
||||
"punpckhdq %%xmm5,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"movdqa %%xmm3,%%xmm8 \n"
|
||||
"punpckldq %%xmm7,%%xmm3 \n"
|
||||
"movlpd %%xmm3,(%1) \n"
|
||||
"movhpd %%xmm3,(%2) \n"
|
||||
"punpckhdq %%xmm7,%%xmm8 \n"
|
||||
"movlpd %%xmm8,(%1,%5) \n"
|
||||
"lea (%1,%5,2),%1 \n"
|
||||
"movhpd %%xmm8,(%2,%6) \n"
|
||||
"lea (%2,%6,2),%2 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
@ -882,17 +882,17 @@ __asm {
|
||||
#define HAS_REVERSE_LINE_SSSE3
|
||||
static void ReverseLine_SSSE3(const uint8* src, uint8* dst, int width) {
|
||||
intptr_t temp_width = static_cast<intptr_t>(width);
|
||||
asm volatile(
|
||||
"movdqa (%3),%%xmm5\n"
|
||||
"lea -0x10(%0,%2,1),%0\n"
|
||||
"1:\n"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"lea -0x10(%0),%0\n"
|
||||
"pshufb %%xmm5,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movdqa (%3),%%xmm5 \n"
|
||||
"lea -0x10(%0,%2,1),%0 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"lea -0x10(%0),%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(temp_width) // %2
|
||||
@ -1091,19 +1091,19 @@ void ReverseLineUV_SSSE3(const uint8* src,
|
||||
uint8* dst_a, uint8* dst_b,
|
||||
int width) {
|
||||
intptr_t temp_width = static_cast<intptr_t>(width);
|
||||
asm volatile(
|
||||
"movdqa (%4),%%xmm5\n"
|
||||
"lea -0x10(%0,%3,2),%0\n"
|
||||
"1:\n"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"lea -0x10(%0),%0\n"
|
||||
"pshufb %%xmm5,%%xmm0\n"
|
||||
"movlpd %%xmm0,(%1)\n"
|
||||
"lea 0x8(%1),%1\n"
|
||||
"movhpd %%xmm0,(%2)\n"
|
||||
"lea 0x8(%2),%2\n"
|
||||
"sub $0x8,%3\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movdqa (%4),%%xmm5 \n"
|
||||
"lea -0x10(%0,%3,2),%0 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"lea -0x10(%0),%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"movlpd %%xmm0,(%1) \n"
|
||||
"lea 0x8(%1),%1 \n"
|
||||
"movhpd %%xmm0,(%2) \n"
|
||||
"lea 0x8(%2),%2 \n"
|
||||
"sub $0x8,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
|
||||
@ -15,12 +15,12 @@ namespace libyuv {
#if defined(__ARM_NEON__) && !defined(COVERAGE_ENABLED)

void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
asm volatile(
asm volatile (
// compute where to start writing destination
"add %1, %2\n"
"add %1, %2 \n"

// work on segments that are multiples of 16
"lsrs r3, %2, #4\n"
"lsrs r3, %2, #4 \n"

// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
@ -28,72 +28,72 @@ void ReverseLine_NEON(const uint8* src, uint8* dst, int width) {
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24\n"
"mov r3, #-24 \n"

"beq 2f\n"
"beq 2f \n"

// back of destination by the size of the register that is
// going to be reversed
"sub %1, #16\n"
"sub %1, #16 \n"

// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16\n"
"sub %2, #16 \n"

"1:\n"
"vld1.8 {q0}, [%0]!\n" // src += 16
"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16

// reverse the bytes in the 64 bit segments. unable to reverse
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0\n"
"vrev64.8 q0, q0 \n"

// because of the inability to reverse the entire 128 bits
// reverse the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]!\n"
"vst1.8 {d0}, [%1], r3\n" // dst -= 16
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16

"subs %2, #16\n"
"bge 1b\n"
"subs %2, #16 \n"
"bge 1b \n"

// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16\n"
"beq 5f\n"
"adds %2, #16 \n"
"beq 5f \n"

"add %1, #16\n"
"add %1, #16 \n"

"2:\n"
"2: \n"

"mov r3, #-3\n"
"mov r3, #-3 \n"

"sub %1, #2\n"
"subs %2, #2\n"
"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f\n"
"blt 4f \n"

// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3:\n"
"vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d1[0]}, [%1]!\n"
"vst1.8 {d0[0]}, [%1], r3\n" // dst -= 2
"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2

"subs %2, #2\n"
"bge 3b\n"
"subs %2, #2 \n"
"bge 3b \n"

"adds %2, #2\n"
"beq 5f\n"
"adds %2, #2 \n"
"beq 5f \n"

"4:\n"
"add %1, #1\n"
"vld1.8 {d0[0]}, [%0]\n"
"vst1.8 {d0[0]}, [%1]\n"
"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"

"5:\n"
"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
||||
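The control flow above (16-byte blocks first, then 2-byte pairs, then a single trailing byte) follows the ARM "dealing with leftovers" pattern linked in the comment. A plain C++ sketch of the same structure, illustrative only and not the library's fallback path:

```cpp
#include <cstdint>

// Scalar sketch of ReverseLine: big blocks first, then leftovers, writing
// from the end of the destination row backwards.
void ReverseLineScalar(const uint8_t* src, uint8_t* dst, int width) {
  dst += width;                  // start writing at the end of the row
  while (width >= 16) {          // segments that are multiples of 16
    dst -= 16;
    for (int i = 0; i < 16; ++i)
      dst[15 - i] = src[i];      // reverse this 16-byte block
    src += 16;
    width -= 16;
  }
  while (width > 0) {            // leftovers, one byte at a time
    *--dst = *src++;
    --width;
  }
}
```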
@ -108,154 +108,154 @@ static const uint8 vtbl_4x4_transpose[16] __attribute__((vector_size(16))) =
|
||||
void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride,
|
||||
int width) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %4, #8\n"
|
||||
"sub %4, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1:\n"
|
||||
"mov r9, %0\n"
|
||||
"1: \n"
|
||||
"mov r9, %0 \n"
|
||||
|
||||
"vld1.8 {d0}, [r9], %1\n"
|
||||
"vld1.8 {d1}, [r9], %1\n"
|
||||
"vld1.8 {d2}, [r9], %1\n"
|
||||
"vld1.8 {d3}, [r9], %1\n"
|
||||
"vld1.8 {d4}, [r9], %1\n"
|
||||
"vld1.8 {d5}, [r9], %1\n"
|
||||
"vld1.8 {d6}, [r9], %1\n"
|
||||
"vld1.8 {d7}, [r9]\n"
|
||||
"vld1.8 {d0}, [r9], %1 \n"
|
||||
"vld1.8 {d1}, [r9], %1 \n"
|
||||
"vld1.8 {d2}, [r9], %1 \n"
|
||||
"vld1.8 {d3}, [r9], %1 \n"
|
||||
"vld1.8 {d4}, [r9], %1 \n"
|
||||
"vld1.8 {d5}, [r9], %1 \n"
|
||||
"vld1.8 {d6}, [r9], %1 \n"
|
||||
"vld1.8 {d7}, [r9] \n"
|
||||
|
||||
"vtrn.8 d1, d0\n"
|
||||
"vtrn.8 d3, d2\n"
|
||||
"vtrn.8 d5, d4\n"
|
||||
"vtrn.8 d7, d6\n"
|
||||
"vtrn.8 d1, d0 \n"
|
||||
"vtrn.8 d3, d2 \n"
|
||||
"vtrn.8 d5, d4 \n"
|
||||
"vtrn.8 d7, d6 \n"
|
||||
|
||||
"vtrn.16 d1, d3\n"
|
||||
"vtrn.16 d0, d2\n"
|
||||
"vtrn.16 d5, d7\n"
|
||||
"vtrn.16 d4, d6\n"
|
||||
"vtrn.16 d1, d3 \n"
|
||||
"vtrn.16 d0, d2 \n"
|
||||
"vtrn.16 d5, d7 \n"
|
||||
"vtrn.16 d4, d6 \n"
|
||||
|
||||
"vtrn.32 d1, d5\n"
|
||||
"vtrn.32 d0, d4\n"
|
||||
"vtrn.32 d3, d7\n"
|
||||
"vtrn.32 d2, d6\n"
|
||||
"vtrn.32 d1, d5 \n"
|
||||
"vtrn.32 d0, d4 \n"
|
||||
"vtrn.32 d3, d7 \n"
|
||||
"vtrn.32 d2, d6 \n"
|
||||
|
||||
"vrev16.8 q0, q0\n"
|
||||
"vrev16.8 q1, q1\n"
|
||||
"vrev16.8 q2, q2\n"
|
||||
"vrev16.8 q3, q3\n"
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vst1.8 {d1}, [r9], %3\n"
|
||||
"vst1.8 {d0}, [r9], %3\n"
|
||||
"vst1.8 {d3}, [r9], %3\n"
|
||||
"vst1.8 {d2}, [r9], %3\n"
|
||||
"vst1.8 {d5}, [r9], %3\n"
|
||||
"vst1.8 {d4}, [r9], %3\n"
|
||||
"vst1.8 {d7}, [r9], %3\n"
|
||||
"vst1.8 {d6}, [r9]\n"
|
||||
"vst1.8 {d1}, [r9], %3 \n"
|
||||
"vst1.8 {d0}, [r9], %3 \n"
|
||||
"vst1.8 {d3}, [r9], %3 \n"
|
||||
"vst1.8 {d2}, [r9], %3 \n"
|
||||
"vst1.8 {d5}, [r9], %3 \n"
|
||||
"vst1.8 {d4}, [r9], %3 \n"
|
||||
"vst1.8 {d7}, [r9], %3 \n"
|
||||
"vst1.8 {d6}, [r9] \n"
|
||||
|
||||
"add %0, #8\n" // src += 8
|
||||
"add %2, %2, %3, lsl #3\n" // dst += 8 * dst_stride
|
||||
"subs %4, #8\n" // w -= 8
|
||||
"bge 1b\n"
|
||||
"add %0, #8 \n" // src += 8
|
||||
"add %2, %2, %3, lsl #3 \n" // dst += 8 * dst_stride
|
||||
"subs %4, #8 \n" // w -= 8
|
||||
"bge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %4, #8\n"
|
||||
"beq 4f\n"
|
||||
"adds %4, #8 \n"
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %4, #2\n"
|
||||
"blt 3f\n"
|
||||
"cmp %4, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
"cmp %4, #4\n"
|
||||
"blt 2f\n"
|
||||
"cmp %4, #4 \n"
|
||||
"blt 2f \n"
|
||||
|
||||
// 4x8 block
|
||||
"mov r9, %0\n"
|
||||
"vld1.32 {d0[0]}, [r9], %1\n"
|
||||
"vld1.32 {d0[1]}, [r9], %1\n"
|
||||
"vld1.32 {d1[0]}, [r9], %1\n"
|
||||
"vld1.32 {d1[1]}, [r9], %1\n"
|
||||
"vld1.32 {d2[0]}, [r9], %1\n"
|
||||
"vld1.32 {d2[1]}, [r9], %1\n"
|
||||
"vld1.32 {d3[0]}, [r9], %1\n"
|
||||
"vld1.32 {d3[1]}, [r9]\n"
|
||||
"mov r9, %0 \n"
|
||||
"vld1.32 {d0[0]}, [r9], %1 \n"
|
||||
"vld1.32 {d0[1]}, [r9], %1 \n"
|
||||
"vld1.32 {d1[0]}, [r9], %1 \n"
|
||||
"vld1.32 {d1[1]}, [r9], %1 \n"
|
||||
"vld1.32 {d2[0]}, [r9], %1 \n"
|
||||
"vld1.32 {d2[1]}, [r9], %1 \n"
|
||||
"vld1.32 {d3[0]}, [r9], %1 \n"
|
||||
"vld1.32 {d3[1]}, [r9] \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vld1.8 {q3}, [%5]\n"
|
||||
"vld1.8 {q3}, [%5] \n"
|
||||
|
||||
"vtbl.8 d4, {d0, d1}, d6\n"
|
||||
"vtbl.8 d5, {d0, d1}, d7\n"
|
||||
"vtbl.8 d0, {d2, d3}, d6\n"
|
||||
"vtbl.8 d1, {d2, d3}, d7\n"
|
||||
"vtbl.8 d4, {d0, d1}, d6 \n"
|
||||
"vtbl.8 d5, {d0, d1}, d7 \n"
|
||||
"vtbl.8 d0, {d2, d3}, d6 \n"
|
||||
"vtbl.8 d1, {d2, d3}, d7 \n"
|
||||
|
||||
// TODO: rework shuffle above to write
|
||||
// out with 4 instead of 8 writes
|
||||
"vst1.32 {d4[0]}, [r9], %3\n"
|
||||
"vst1.32 {d4[1]}, [r9], %3\n"
|
||||
"vst1.32 {d5[0]}, [r9], %3\n"
|
||||
"vst1.32 {d5[1]}, [r9]\n"
|
||||
"vst1.32 {d4[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d4[1]}, [r9], %3 \n"
|
||||
"vst1.32 {d5[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d5[1]}, [r9] \n"
|
||||
|
||||
"add r9, %2, #4\n"
|
||||
"vst1.32 {d0[0]}, [r9], %3\n"
|
||||
"vst1.32 {d0[1]}, [r9], %3\n"
|
||||
"vst1.32 {d1[0]}, [r9], %3\n"
|
||||
"vst1.32 {d1[1]}, [r9]\n"
|
||||
"add r9, %2, #4 \n"
|
||||
"vst1.32 {d0[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d0[1]}, [r9], %3 \n"
|
||||
"vst1.32 {d1[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d1[1]}, [r9] \n"
|
||||
|
||||
"add %0, #4\n" // src += 4
|
||||
"add %2, %2, %3, lsl #2\n" // dst += 4 * dst_stride
|
||||
"subs %4, #4\n" // w -= 4
|
||||
"beq 4f\n"
|
||||
"add %0, #4 \n" // src += 4
|
||||
"add %2, %2, %3, lsl #2 \n" // dst += 4 * dst_stride
|
||||
"subs %4, #4 \n" // w -= 4
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %4, #2\n"
|
||||
"blt 3f\n"
|
||||
"cmp %4, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2:\n"
|
||||
"mov r9, %0\n"
|
||||
"vld1.16 {d0[0]}, [r9], %1\n"
|
||||
"vld1.16 {d1[0]}, [r9], %1\n"
|
||||
"vld1.16 {d0[1]}, [r9], %1\n"
|
||||
"vld1.16 {d1[1]}, [r9], %1\n"
|
||||
"vld1.16 {d0[2]}, [r9], %1\n"
|
||||
"vld1.16 {d1[2]}, [r9], %1\n"
|
||||
"vld1.16 {d0[3]}, [r9], %1\n"
|
||||
"vld1.16 {d1[3]}, [r9]\n"
|
||||
"2: \n"
|
||||
"mov r9, %0 \n"
|
||||
"vld1.16 {d0[0]}, [r9], %1 \n"
|
||||
"vld1.16 {d1[0]}, [r9], %1 \n"
|
||||
"vld1.16 {d0[1]}, [r9], %1 \n"
|
||||
"vld1.16 {d1[1]}, [r9], %1 \n"
|
||||
"vld1.16 {d0[2]}, [r9], %1 \n"
|
||||
"vld1.16 {d1[2]}, [r9], %1 \n"
|
||||
"vld1.16 {d0[3]}, [r9], %1 \n"
|
||||
"vld1.16 {d1[3]}, [r9] \n"
|
||||
|
||||
"vtrn.8 d0, d1\n"
|
||||
"vtrn.8 d0, d1 \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vst1.64 {d0}, [r9], %3\n"
|
||||
"vst1.64 {d1}, [r9]\n"
|
||||
"vst1.64 {d0}, [r9], %3 \n"
|
||||
"vst1.64 {d1}, [r9] \n"
|
||||
|
||||
"add %0, #2\n" // src += 2
|
||||
"add %2, %2, %3, lsl #1\n" // dst += 2 * dst_stride
|
||||
"subs %4, #2\n" // w -= 2
|
||||
"beq 4f\n"
|
||||
"add %0, #2 \n" // src += 2
|
||||
"add %2, %2, %3, lsl #1 \n" // dst += 2 * dst_stride
|
||||
"subs %4, #2 \n" // w -= 2
|
||||
"beq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3:\n"
|
||||
"vld1.8 {d0[0]}, [%0], %1\n"
|
||||
"vld1.8 {d0[1]}, [%0], %1\n"
|
||||
"vld1.8 {d0[2]}, [%0], %1\n"
|
||||
"vld1.8 {d0[3]}, [%0], %1\n"
|
||||
"vld1.8 {d0[4]}, [%0], %1\n"
|
||||
"vld1.8 {d0[5]}, [%0], %1\n"
|
||||
"vld1.8 {d0[6]}, [%0], %1\n"
|
||||
"vld1.8 {d0[7]}, [%0]\n"
|
||||
"3: \n"
|
||||
"vld1.8 {d0[0]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[1]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[2]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[3]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[4]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[5]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[6]}, [%0], %1 \n"
|
||||
"vld1.8 {d0[7]}, [%0] \n"
|
||||
|
||||
"vst1.64 {d0}, [%2]\n"
|
||||
"vst1.64 {d0}, [%2] \n"
|
||||
|
||||
"4:\n"
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src_stride), // %1
|
||||
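Not part of the patch: a scalar reference for one 8x8 tile of TransposeWx8_NEON, under the assumption that the vtrn.8/.16/.32 plus vrev16 sequence above is only a register-level way of producing the same transposed tile. The name is invented for illustration.

#include <stdint.h>

// dst(i, j) = src(j, i) for one 8x8 block of bytes.
static void TransposeBlock8x8_Ref(const uint8_t* src, int src_stride,
                                  uint8_t* dst, int dst_stride) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst[i * dst_stride + j] = src[j * src_stride + i];
    }
  }
}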
@ -270,68 +270,68 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
|
||||
void ReverseLineUV_NEON(const uint8* src,
|
||||
uint8* dst_a, uint8* dst_b,
|
||||
int width) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// compute where to start writing destination
|
||||
"add %1, %3\n" // dst_a + width
|
||||
"add %2, %3\n" // dst_b + width
|
||||
"add %1, %3 \n" // dst_a + width
|
||||
"add %2, %3 \n" // dst_b + width
|
||||
|
||||
// work on input segments that are multiples of 16, but
|
||||
// width that has been passed is output segments, half
|
||||
// the size of input.
|
||||
"lsrs r12, %3, #3\n"
|
||||
"lsrs r12, %3, #3 \n"
|
||||
|
||||
"beq 2f\n"
|
||||
"beq 2f \n"
|
||||
|
||||
// the output is written in to two blocks.
|
||||
"mov r12, #-8\n"
|
||||
"mov r12, #-8 \n"
|
||||
|
||||
// back of destination by the size of the register that is
|
||||
// going to be reversed
|
||||
"sub %1, #8\n"
|
||||
"sub %2, #8\n"
|
||||
"sub %1, #8 \n"
|
||||
"sub %2, #8 \n"
|
||||
|
||||
// the loop needs to run on blocks of 8. what will be left
|
||||
// over is either a negative number, the residuals that need
|
||||
// to be done, or 0. if this isn't subtracted off here the
|
||||
// loop will run one extra time.
|
||||
"sub %3, #8\n"
|
||||
"sub %3, #8 \n"
|
||||
|
||||
"1:\n"
|
||||
"vld2.8 {d0, d1}, [%0]!\n" // src += 16
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d1}, [%0]! \n" // src += 16
|
||||
|
||||
// reverse the bytes in the 64 bit segments
|
||||
"vrev64.8 q0, q0\n"
|
||||
"vrev64.8 q0, q0 \n"
|
||||
|
||||
"vst1.8 {d0}, [%1], r12\n" // dst_a -= 8
|
||||
"vst1.8 {d1}, [%2], r12\n" // dst_b -= 8
|
||||
"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
|
||||
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8
|
||||
|
||||
"subs %3, #8\n"
|
||||
"bge 1b\n"
|
||||
"subs %3, #8 \n"
|
||||
"bge 1b \n"
|
||||
|
||||
// add 8 back to the counter. if the result is 0 there is no
|
||||
// residuals so return
|
||||
"adds %3, #8\n"
|
||||
"beq 4f\n"
|
||||
"adds %3, #8 \n"
|
||||
"beq 4f \n"
|
||||
|
||||
"add %1, #8\n"
|
||||
"add %2, #8\n"
|
||||
"add %1, #8 \n"
|
||||
"add %2, #8 \n"
|
||||
|
||||
"2:\n"
|
||||
"2: \n"
|
||||
|
||||
"mov r12, #-1\n"
|
||||
"mov r12, #-1 \n"
|
||||
|
||||
"sub %1, #1\n"
|
||||
"sub %2, #1\n"
|
||||
"sub %1, #1 \n"
|
||||
"sub %2, #1 \n"
|
||||
|
||||
"3:\n"
|
||||
"vld2.8 {d0[0], d1[0]}, [%0]!\n" // src += 2
|
||||
"3: \n"
|
||||
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2
|
||||
|
||||
"vst1.8 {d0[0]}, [%1], r12\n" // dst_a -= 1
|
||||
"vst1.8 {d1[0]}, [%2], r12\n" // dst_b -= 1
|
||||
"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
|
||||
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1
|
||||
|
||||
"subs %3, %3, #1\n"
|
||||
"bgt 3b\n"
|
||||
"4:\n"
|
||||
"subs %3, %3, #1 \n"
|
||||
"bgt 3b \n"
|
||||
"4: \n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst_a), // %1
|
||||
"+r"(dst_b), // %2
|
||||
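Not part of the patch: a scalar sketch of what ReverseLineUV_NEON computes, assuming src holds interleaved U/V bytes and width counts output elements per plane (so the source row is 2 * width bytes). Invented name.

#include <stdint.h>

// De-interleave a UV row and write each plane out mirrored.
static void ReverseRowUV_Ref(const uint8_t* src,
                             uint8_t* dst_a, uint8_t* dst_b, int width) {
  for (int i = 0; i < width; ++i) {
    dst_a[width - 1 - i] = src[2 * i + 0];
    dst_b[width - 1 - i] = src[2 * i + 1];
  }
}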
@ -348,198 +348,198 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
uint8* dst_b, int dst_stride_b,
|
||||
int width) {
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
// loops are on blocks of 8. loop will stop when
|
||||
// counter gets to or below 0. starting the counter
|
||||
// at w-8 allow for this
|
||||
"sub %6, #8\n"
|
||||
"sub %6, #8 \n"
|
||||
|
||||
// handle 8x8 blocks. this should be the majority of the plane
|
||||
"1:\n"
|
||||
"mov r9, %0\n"
|
||||
"1: \n"
|
||||
"mov r9, %0 \n"
|
||||
|
||||
"vld2.8 {d0, d1}, [r9], %1\n"
|
||||
"vld2.8 {d2, d3}, [r9], %1\n"
|
||||
"vld2.8 {d4, d5}, [r9], %1\n"
|
||||
"vld2.8 {d6, d7}, [r9], %1\n"
|
||||
"vld2.8 {d16, d17}, [r9], %1\n"
|
||||
"vld2.8 {d18, d19}, [r9], %1\n"
|
||||
"vld2.8 {d20, d21}, [r9], %1\n"
|
||||
"vld2.8 {d22, d23}, [r9]\n"
|
||||
"vld2.8 {d0, d1}, [r9], %1 \n"
|
||||
"vld2.8 {d2, d3}, [r9], %1 \n"
|
||||
"vld2.8 {d4, d5}, [r9], %1 \n"
|
||||
"vld2.8 {d6, d7}, [r9], %1 \n"
|
||||
"vld2.8 {d16, d17}, [r9], %1 \n"
|
||||
"vld2.8 {d18, d19}, [r9], %1 \n"
|
||||
"vld2.8 {d20, d21}, [r9], %1 \n"
|
||||
"vld2.8 {d22, d23}, [r9] \n"
|
||||
|
||||
"vtrn.8 q1, q0\n"
|
||||
"vtrn.8 q3, q2\n"
|
||||
"vtrn.8 q9, q8\n"
|
||||
"vtrn.8 q11, q10\n"
|
||||
"vtrn.8 q1, q0 \n"
|
||||
"vtrn.8 q3, q2 \n"
|
||||
"vtrn.8 q9, q8 \n"
|
||||
"vtrn.8 q11, q10 \n"
|
||||
|
||||
"vtrn.16 q1, q3\n"
|
||||
"vtrn.16 q0, q2\n"
|
||||
"vtrn.16 q9, q11\n"
|
||||
"vtrn.16 q8, q10\n"
|
||||
"vtrn.16 q1, q3 \n"
|
||||
"vtrn.16 q0, q2 \n"
|
||||
"vtrn.16 q9, q11 \n"
|
||||
"vtrn.16 q8, q10 \n"
|
||||
|
||||
"vtrn.32 q1, q9\n"
|
||||
"vtrn.32 q0, q8\n"
|
||||
"vtrn.32 q3, q11\n"
|
||||
"vtrn.32 q2, q10\n"
|
||||
"vtrn.32 q1, q9 \n"
|
||||
"vtrn.32 q0, q8 \n"
|
||||
"vtrn.32 q3, q11 \n"
|
||||
"vtrn.32 q2, q10 \n"
|
||||
|
||||
"vrev16.8 q0, q0\n"
|
||||
"vrev16.8 q1, q1\n"
|
||||
"vrev16.8 q2, q2\n"
|
||||
"vrev16.8 q3, q3\n"
|
||||
"vrev16.8 q8, q8\n"
|
||||
"vrev16.8 q9, q9\n"
|
||||
"vrev16.8 q10, q10\n"
|
||||
"vrev16.8 q11, q11\n"
|
||||
"vrev16.8 q0, q0 \n"
|
||||
"vrev16.8 q1, q1 \n"
|
||||
"vrev16.8 q2, q2 \n"
|
||||
"vrev16.8 q3, q3 \n"
|
||||
"vrev16.8 q8, q8 \n"
|
||||
"vrev16.8 q9, q9 \n"
|
||||
"vrev16.8 q10, q10 \n"
|
||||
"vrev16.8 q11, q11 \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vst1.8 {d2}, [r9], %3\n"
|
||||
"vst1.8 {d0}, [r9], %3\n"
|
||||
"vst1.8 {d6}, [r9], %3\n"
|
||||
"vst1.8 {d4}, [r9], %3\n"
|
||||
"vst1.8 {d18}, [r9], %3\n"
|
||||
"vst1.8 {d16}, [r9], %3\n"
|
||||
"vst1.8 {d22}, [r9], %3\n"
|
||||
"vst1.8 {d20}, [r9]\n"
|
||||
"vst1.8 {d2}, [r9], %3 \n"
|
||||
"vst1.8 {d0}, [r9], %3 \n"
|
||||
"vst1.8 {d6}, [r9], %3 \n"
|
||||
"vst1.8 {d4}, [r9], %3 \n"
|
||||
"vst1.8 {d18}, [r9], %3 \n"
|
||||
"vst1.8 {d16}, [r9], %3 \n"
|
||||
"vst1.8 {d22}, [r9], %3 \n"
|
||||
"vst1.8 {d20}, [r9] \n"
|
||||
|
||||
"mov r9, %4\n"
|
||||
"mov r9, %4 \n"
|
||||
|
||||
"vst1.8 {d3}, [r9], %5\n"
|
||||
"vst1.8 {d1}, [r9], %5\n"
|
||||
"vst1.8 {d7}, [r9], %5\n"
|
||||
"vst1.8 {d5}, [r9], %5\n"
|
||||
"vst1.8 {d19}, [r9], %5\n"
|
||||
"vst1.8 {d17}, [r9], %5\n"
|
||||
"vst1.8 {d23}, [r9], %5\n"
|
||||
"vst1.8 {d21}, [r9]\n"
|
||||
"vst1.8 {d3}, [r9], %5 \n"
|
||||
"vst1.8 {d1}, [r9], %5 \n"
|
||||
"vst1.8 {d7}, [r9], %5 \n"
|
||||
"vst1.8 {d5}, [r9], %5 \n"
|
||||
"vst1.8 {d19}, [r9], %5 \n"
|
||||
"vst1.8 {d17}, [r9], %5 \n"
|
||||
"vst1.8 {d23}, [r9], %5 \n"
|
||||
"vst1.8 {d21}, [r9] \n"
|
||||
|
||||
"add %0, #8*2\n" // src += 8*2
|
||||
"add %2, %2, %3, lsl #3\n" // dst_a += 8 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #3\n" // dst_b += 8 * dst_stride_b
|
||||
"subs %6, #8\n" // w -= 8
|
||||
"bge 1b\n"
|
||||
"add %0, #8*2 \n" // src += 8*2
|
||||
"add %2, %2, %3, lsl #3 \n" // dst_a += 8 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #3 \n" // dst_b += 8 * dst_stride_b
|
||||
"subs %6, #8 \n" // w -= 8
|
||||
"bge 1b \n"
|
||||
|
||||
// add 8 back to counter. if the result is 0 there are
|
||||
// no residuals.
|
||||
"adds %6, #8\n"
|
||||
"beq 4f\n"
|
||||
"adds %6, #8 \n"
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, so between 1 and 7 lines left to transpose
|
||||
"cmp %6, #2\n"
|
||||
"blt 3f\n"
|
||||
"cmp %6, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
"cmp %6, #4\n"
|
||||
"blt 2f\n"
|
||||
"cmp %6, #4 \n"
|
||||
"blt 2f \n"
|
||||
|
||||
//TODO(frkoenig) : clean this up
|
||||
// 4x8 block
|
||||
"mov r9, %0\n"
|
||||
"vld1.64 {d0}, [r9], %1\n"
|
||||
"vld1.64 {d1}, [r9], %1\n"
|
||||
"vld1.64 {d2}, [r9], %1\n"
|
||||
"vld1.64 {d3}, [r9], %1\n"
|
||||
"vld1.64 {d4}, [r9], %1\n"
|
||||
"vld1.64 {d5}, [r9], %1\n"
|
||||
"vld1.64 {d6}, [r9], %1\n"
|
||||
"vld1.64 {d7}, [r9]\n"
|
||||
"mov r9, %0 \n"
|
||||
"vld1.64 {d0}, [r9], %1 \n"
|
||||
"vld1.64 {d1}, [r9], %1 \n"
|
||||
"vld1.64 {d2}, [r9], %1 \n"
|
||||
"vld1.64 {d3}, [r9], %1 \n"
|
||||
"vld1.64 {d4}, [r9], %1 \n"
|
||||
"vld1.64 {d5}, [r9], %1 \n"
|
||||
"vld1.64 {d6}, [r9], %1 \n"
|
||||
"vld1.64 {d7}, [r9] \n"
|
||||
|
||||
"vld1.8 {q15}, [%7]\n"
|
||||
"vld1.8 {q15}, [%7] \n"
|
||||
|
||||
"vtrn.8 q0, q1\n"
|
||||
"vtrn.8 q2, q3\n"
|
||||
"vtrn.8 q0, q1 \n"
|
||||
"vtrn.8 q2, q3 \n"
|
||||
|
||||
"vtbl.8 d16, {d0, d1}, d30\n"
|
||||
"vtbl.8 d17, {d0, d1}, d31\n"
|
||||
"vtbl.8 d18, {d2, d3}, d30\n"
|
||||
"vtbl.8 d19, {d2, d3}, d31\n"
|
||||
"vtbl.8 d20, {d4, d5}, d30\n"
|
||||
"vtbl.8 d21, {d4, d5}, d31\n"
|
||||
"vtbl.8 d22, {d6, d7}, d30\n"
|
||||
"vtbl.8 d23, {d6, d7}, d31\n"
|
||||
"vtbl.8 d16, {d0, d1}, d30 \n"
|
||||
"vtbl.8 d17, {d0, d1}, d31 \n"
|
||||
"vtbl.8 d18, {d2, d3}, d30 \n"
|
||||
"vtbl.8 d19, {d2, d3}, d31 \n"
|
||||
"vtbl.8 d20, {d4, d5}, d30 \n"
|
||||
"vtbl.8 d21, {d4, d5}, d31 \n"
|
||||
"vtbl.8 d22, {d6, d7}, d30 \n"
|
||||
"vtbl.8 d23, {d6, d7}, d31 \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vst1.32 {d16[0]}, [r9], %3\n"
|
||||
"vst1.32 {d16[1]}, [r9], %3\n"
|
||||
"vst1.32 {d17[0]}, [r9], %3\n"
|
||||
"vst1.32 {d17[1]}, [r9], %3\n"
|
||||
"vst1.32 {d16[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d16[1]}, [r9], %3 \n"
|
||||
"vst1.32 {d17[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d17[1]}, [r9], %3 \n"
|
||||
|
||||
"add r9, %2, #4\n"
|
||||
"vst1.32 {d20[0]}, [r9], %3\n"
|
||||
"vst1.32 {d20[1]}, [r9], %3\n"
|
||||
"vst1.32 {d21[0]}, [r9], %3\n"
|
||||
"vst1.32 {d21[1]}, [r9]\n"
|
||||
"add r9, %2, #4 \n"
|
||||
"vst1.32 {d20[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d20[1]}, [r9], %3 \n"
|
||||
"vst1.32 {d21[0]}, [r9], %3 \n"
|
||||
"vst1.32 {d21[1]}, [r9] \n"
|
||||
|
||||
"mov r9, %4\n"
|
||||
"mov r9, %4 \n"
|
||||
|
||||
"vst1.32 {d18[0]}, [r9], %5\n"
|
||||
"vst1.32 {d18[1]}, [r9], %5\n"
|
||||
"vst1.32 {d19[0]}, [r9], %5\n"
|
||||
"vst1.32 {d19[1]}, [r9], %5\n"
|
||||
"vst1.32 {d18[0]}, [r9], %5 \n"
|
||||
"vst1.32 {d18[1]}, [r9], %5 \n"
|
||||
"vst1.32 {d19[0]}, [r9], %5 \n"
|
||||
"vst1.32 {d19[1]}, [r9], %5 \n"
|
||||
|
||||
"add r9, %4, #4\n"
|
||||
"vst1.32 {d22[0]}, [r9], %5\n"
|
||||
"vst1.32 {d22[1]}, [r9], %5\n"
|
||||
"vst1.32 {d23[0]}, [r9], %5\n"
|
||||
"vst1.32 {d23[1]}, [r9]\n"
|
||||
"add r9, %4, #4 \n"
|
||||
"vst1.32 {d22[0]}, [r9], %5 \n"
|
||||
"vst1.32 {d22[1]}, [r9], %5 \n"
|
||||
"vst1.32 {d23[0]}, [r9], %5 \n"
|
||||
"vst1.32 {d23[1]}, [r9] \n"
|
||||
|
||||
"add %0, #4*2\n" // src += 4 * 2
|
||||
"add %2, %2, %3, lsl #2\n" // dst_a += 4 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #2\n" // dst_b += 4 * dst_stride_b
|
||||
"subs %6, #4\n" // w -= 4
|
||||
"beq 4f\n"
|
||||
"add %0, #4*2 \n" // src += 4 * 2
|
||||
"add %2, %2, %3, lsl #2 \n" // dst_a += 4 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #2 \n" // dst_b += 4 * dst_stride_b
|
||||
"subs %6, #4 \n" // w -= 4
|
||||
"beq 4f \n"
|
||||
|
||||
// some residual, check to see if it includes a 2x8 block,
|
||||
// or less
|
||||
"cmp %6, #2\n"
|
||||
"blt 3f\n"
|
||||
"cmp %6, #2 \n"
|
||||
"blt 3f \n"
|
||||
|
||||
// 2x8 block
|
||||
"2:\n"
|
||||
"mov r9, %0\n"
|
||||
"vld2.16 {d0[0], d2[0]}, [r9], %1\n"
|
||||
"vld2.16 {d1[0], d3[0]}, [r9], %1\n"
|
||||
"vld2.16 {d0[1], d2[1]}, [r9], %1\n"
|
||||
"vld2.16 {d1[1], d3[1]}, [r9], %1\n"
|
||||
"vld2.16 {d0[2], d2[2]}, [r9], %1\n"
|
||||
"vld2.16 {d1[2], d3[2]}, [r9], %1\n"
|
||||
"vld2.16 {d0[3], d2[3]}, [r9], %1\n"
|
||||
"vld2.16 {d1[3], d3[3]}, [r9]\n"
|
||||
"2: \n"
|
||||
"mov r9, %0 \n"
|
||||
"vld2.16 {d0[0], d2[0]}, [r9], %1 \n"
|
||||
"vld2.16 {d1[0], d3[0]}, [r9], %1 \n"
|
||||
"vld2.16 {d0[1], d2[1]}, [r9], %1 \n"
|
||||
"vld2.16 {d1[1], d3[1]}, [r9], %1 \n"
|
||||
"vld2.16 {d0[2], d2[2]}, [r9], %1 \n"
|
||||
"vld2.16 {d1[2], d3[2]}, [r9], %1 \n"
|
||||
"vld2.16 {d0[3], d2[3]}, [r9], %1 \n"
|
||||
"vld2.16 {d1[3], d3[3]}, [r9] \n"
|
||||
|
||||
"vtrn.8 d0, d1\n"
|
||||
"vtrn.8 d2, d3\n"
|
||||
"vtrn.8 d0, d1 \n"
|
||||
"vtrn.8 d2, d3 \n"
|
||||
|
||||
"mov r9, %2\n"
|
||||
"mov r9, %2 \n"
|
||||
|
||||
"vst1.64 {d0}, [r9], %3\n"
|
||||
"vst1.64 {d2}, [r9]\n"
|
||||
"vst1.64 {d0}, [r9], %3 \n"
|
||||
"vst1.64 {d2}, [r9] \n"
|
||||
|
||||
"mov r9, %4\n"
|
||||
"mov r9, %4 \n"
|
||||
|
||||
"vst1.64 {d1}, [r9], %5\n"
|
||||
"vst1.64 {d3}, [r9]\n"
|
||||
"vst1.64 {d1}, [r9], %5 \n"
|
||||
"vst1.64 {d3}, [r9] \n"
|
||||
|
||||
"add %0, #2*2\n" // src += 2 * 2
|
||||
"add %2, %2, %3, lsl #1\n" // dst_a += 2 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #1\n" // dst_b += 2 * dst_stride_b
|
||||
"subs %6, #2\n" // w -= 2
|
||||
"beq 4f\n"
|
||||
"add %0, #2*2 \n" // src += 2 * 2
|
||||
"add %2, %2, %3, lsl #1 \n" // dst_a += 2 * dst_stride_a
|
||||
"add %4, %4, %5, lsl #1 \n" // dst_b += 2 * dst_stride_b
|
||||
"subs %6, #2 \n" // w -= 2
|
||||
"beq 4f \n"
|
||||
|
||||
// 1x8 block
|
||||
"3:\n"
|
||||
"vld2.8 {d0[0], d1[0]}, [%0], %1\n"
|
||||
"vld2.8 {d0[1], d1[1]}, [%0], %1\n"
|
||||
"vld2.8 {d0[2], d1[2]}, [%0], %1\n"
|
||||
"vld2.8 {d0[3], d1[3]}, [%0], %1\n"
|
||||
"vld2.8 {d0[4], d1[4]}, [%0], %1\n"
|
||||
"vld2.8 {d0[5], d1[5]}, [%0], %1\n"
|
||||
"vld2.8 {d0[6], d1[6]}, [%0], %1\n"
|
||||
"vld2.8 {d0[7], d1[7]}, [%0]\n"
|
||||
"3: \n"
|
||||
"vld2.8 {d0[0], d1[0]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[1], d1[1]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[2], d1[2]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[3], d1[3]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[4], d1[4]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[5], d1[5]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[6], d1[6]}, [%0], %1 \n"
|
||||
"vld2.8 {d0[7], d1[7]}, [%0] \n"
|
||||
|
||||
"vst1.64 {d0}, [%2]\n"
|
||||
"vst1.64 {d1}, [%4]\n"
|
||||
"vst1.64 {d0}, [%2] \n"
|
||||
"vst1.64 {d1}, [%4] \n"
|
||||
|
||||
"4:\n"
|
||||
"4: \n"
|
||||
|
||||
: "+r"(src), // %0
|
||||
"+r"(src_stride), // %1
|
||||
|
||||
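Not part of the patch: a scalar reference for one 8x8 tile of TransposeUVWx8_NEON, assuming the source rows hold interleaved U/V pairs and each channel is transposed into its own destination plane. Invented name.

#include <stdint.h>

// dst_a(i, j) = U at src(j, i), dst_b(i, j) = V at src(j, i).
static void TransposeUVBlock8x8_Ref(const uint8_t* src, int src_stride,
                                    uint8_t* dst_a, int dst_stride_a,
                                    uint8_t* dst_b, int dst_stride_b) {
  for (int i = 0; i < 8; ++i) {
    for (int j = 0; j < 8; ++j) {
      dst_a[i * dst_stride_a + j] = src[j * src_stride + 2 * i + 0];
      dst_b[i * dst_stride_b + j] = src[j * src_stride + 2 * i + 1];
    }
  }
}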
@ -59,23 +59,23 @@ static const uvec8 kShuffleMaskBGRAToARGB = {
|
||||
};
|
||||
|
||||
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n"
|
||||
"pslld $0x18,%%xmm5\n"
|
||||
"1:"
|
||||
"movq (%0),%%xmm0\n"
|
||||
"lea 0x8(%0),%0\n"
|
||||
"punpcklbw %%xmm0,%%xmm0\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"punpcklwd %%xmm0,%%xmm0\n"
|
||||
"punpckhwd %%xmm1,%%xmm1\n"
|
||||
"por %%xmm5,%%xmm0\n"
|
||||
"por %%xmm5,%%xmm1\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"movdqa %%xmm1,0x10(%1)\n"
|
||||
"lea 0x20(%1),%1\n"
|
||||
"sub $0x8,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n"
|
||||
"pslld $0x18,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movq (%0),%%xmm0 \n"
|
||||
"lea 0x8(%0),%0 \n"
|
||||
"punpcklbw %%xmm0,%%xmm0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklwd %%xmm0,%%xmm0 \n"
|
||||
"punpckhwd %%xmm1,%%xmm1 \n"
|
||||
"por %%xmm5,%%xmm0 \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"lea 0x20(%1),%1 \n"
|
||||
"sub $0x8,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
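Not part of the patch: a scalar sketch of the I400 (grey) to ARGB expansion done above, assuming libyuv's ARGB byte order of B, G, R, A in memory; the 0xff000000 mask built in xmm5 supplies the alpha byte. Invented name, not necessarily identical to the library's own C fallback.

#include <stdint.h>

static void I400ToARGBRow_Ref(const uint8_t* src_y, uint8_t* dst_argb, int pix) {
  for (int i = 0; i < pix; ++i) {
    uint8_t y = src_y[i];
    dst_argb[4 * i + 0] = y;     // B
    dst_argb[4 * i + 1] = y;     // G
    dst_argb[4 * i + 2] = y;     // R
    dst_argb[4 * i + 3] = 0xff;  // A
  }
}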
@ -88,16 +88,16 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"lea 0x10(%0),%0\n"
|
||||
"pshufb %%xmm5,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x4,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movdqa %3,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_abgr), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
@ -111,16 +111,16 @@ void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"movdqa %3,%%xmm5\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"lea 0x10(%0),%0\n"
|
||||
"pshufb %%xmm5,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x4,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movdqa %3,%%xmm5 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"lea 0x10(%0),%0 \n"
|
||||
"pshufb %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x4,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_bgra), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
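Not part of the patch: the single pshufb in the ABGR and BGRA rows above amounts to a byte permutation applied to each 16-byte group of pixels. A sketch, ignoring pshufb's zeroing behaviour for mask bytes with the high bit set (the kShuffleMask constants do not use it); pix is assumed to be a multiple of 4.

#include <stdint.h>

// Apply a 16-entry byte permutation to each group of 4 ARGB pixels.
static void ShuffleRow_Ref(const uint8_t* src, const uint8_t shuffle[16],
                           uint8_t* dst, int pix) {
  for (int i = 0; i < pix; i += 4) {
    const uint8_t* s = src + 4 * i;
    uint8_t* d = dst + 4 * i;
    for (int j = 0; j < 16; ++j) {
      d[j] = s[shuffle[j]];
    }
  }
}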
@ -133,34 +133,34 @@ void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm5\n"
|
||||
"movdqa %3,%%xmm4\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm3\n"
|
||||
"lea 0x30(%0),%0\n"
|
||||
"movdqa %%xmm3,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm4,%%xmm2\n"
|
||||
"por %%xmm5,%%xmm2\n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm4,%%xmm0\n"
|
||||
"movdqa %%xmm2,0x20(%1)\n"
|
||||
"por %%xmm5,%%xmm0\n"
|
||||
"pshufb %%xmm4,%%xmm1\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"por %%xmm5,%%xmm1\n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm4,%%xmm3\n"
|
||||
"movdqa %%xmm1,0x10(%1)\n"
|
||||
"por %%xmm5,%%xmm3\n"
|
||||
"movdqa %%xmm3,0x30(%1)\n"
|
||||
"lea 0x40(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm5 \n"
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm3 \n"
|
||||
"lea 0x30(%0),%0 \n"
|
||||
"movdqa %%xmm3,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm4,%%xmm2 \n"
|
||||
"por %%xmm5,%%xmm2 \n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm4,%%xmm0 \n"
|
||||
"movdqa %%xmm2,0x20(%1) \n"
|
||||
"por %%xmm5,%%xmm0 \n"
|
||||
"pshufb %%xmm4,%%xmm1 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm4,%%xmm3 \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"por %%xmm5,%%xmm3 \n"
|
||||
"movdqa %%xmm3,0x30(%1) \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_bg24), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
@ -173,34 +173,34 @@ void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm5,%%xmm5\n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm5\n"
|
||||
"movdqa %3,%%xmm4\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm3\n"
|
||||
"lea 0x30(%0),%0\n"
|
||||
"movdqa %%xmm3,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm4,%%xmm2\n"
|
||||
"por %%xmm5,%%xmm2\n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm4,%%xmm0\n"
|
||||
"movdqa %%xmm2,0x20(%1)\n"
|
||||
"por %%xmm5,%%xmm0\n"
|
||||
"pshufb %%xmm4,%%xmm1\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"por %%xmm5,%%xmm1\n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm4,%%xmm3\n"
|
||||
"movdqa %%xmm1,0x10(%1)\n"
|
||||
"por %%xmm5,%%xmm3\n"
|
||||
"movdqa %%xmm3,0x30(%1)\n"
|
||||
"lea 0x40(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm5 \n"
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm3 \n"
|
||||
"lea 0x30(%0),%0 \n"
|
||||
"movdqa %%xmm3,%%xmm2 \n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2 \n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm4,%%xmm2 \n"
|
||||
"por %%xmm5,%%xmm2 \n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1 \n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm4,%%xmm0 \n"
|
||||
"movdqa %%xmm2,0x20(%1) \n"
|
||||
"por %%xmm5,%%xmm0 \n"
|
||||
"pshufb %%xmm4,%%xmm1 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3 \n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm4,%%xmm3 \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"por %%xmm5,%%xmm3 \n"
|
||||
"movdqa %%xmm3,0x30(%1) \n"
|
||||
"lea 0x40(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
@ -213,29 +213,29 @@ void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
}
|
||||
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"movdqa %4,%%xmm5\n"
|
||||
"movdqa %3,%%xmm4\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm2\n"
|
||||
"movdqa 0x30(%0),%%xmm3\n"
|
||||
"pmaddubsw %%xmm4,%%xmm0\n"
|
||||
"pmaddubsw %%xmm4,%%xmm1\n"
|
||||
"pmaddubsw %%xmm4,%%xmm2\n"
|
||||
"pmaddubsw %%xmm4,%%xmm3\n"
|
||||
"lea 0x40(%0),%0\n"
|
||||
"phaddw %%xmm1,%%xmm0\n"
|
||||
"phaddw %%xmm3,%%xmm2\n"
|
||||
"psrlw $0x7,%%xmm0\n"
|
||||
"psrlw $0x7,%%xmm2\n"
|
||||
"packuswb %%xmm2,%%xmm0\n"
|
||||
"paddb %%xmm5,%%xmm0\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"lea 0x10(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"movdqa %4,%%xmm5 \n"
|
||||
"movdqa %3,%%xmm4 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm2 \n"
|
||||
"movdqa 0x30(%0),%%xmm3 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm3 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"phaddw %%xmm1,%%xmm0 \n"
|
||||
"phaddw %%xmm3,%%xmm2 \n"
|
||||
"psrlw $0x7,%%xmm0 \n"
|
||||
"psrlw $0x7,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"lea 0x10(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_argb), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(pix) // %2
|
||||
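Not part of the patch: ARGBToYRow_SSSE3 above is a fixed point weighted sum per pixel (pmaddubsw and phaddw form the dot product, psrlw 7 rescales, and paddb adds the bias loaded from %4). The weights and the +16 bias below are assumptions in the spirit of BT.601; the real values live in the constants passed as %3 and %4, which are outside this hunk.

#include <stdint.h>

enum { kYB = 13, kYG = 65, kYR = 33 };   // assumed 7-bit fixed-point weights

static void ARGBToYRow_Ref(const uint8_t* src_argb, uint8_t* dst_y, int pix) {
  for (int i = 0; i < pix; ++i) {
    int b = src_argb[4 * i + 0];
    int g = src_argb[4 * i + 1];
    int r = src_argb[4 * i + 2];
    dst_y[i] = (uint8_t)(((kYB * b + kYG * g + kYR * r) >> 7) + 16);
  }
}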
@ -253,10 +253,10 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
asm volatile(
|
||||
"movdqa %0,%%xmm4\n"
|
||||
"movdqa %1,%%xmm3\n"
|
||||
"movdqa %2,%%xmm5\n"
|
||||
asm volatile (
|
||||
"movdqa %0,%%xmm4 \n"
|
||||
"movdqa %1,%%xmm3 \n"
|
||||
"movdqa %2,%%xmm5 \n"
|
||||
:
|
||||
: "m"(kARGBToU), // %0
|
||||
"m"(kARGBToV), // %1
|
||||
@ -266,43 +266,43 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
"xmm3", "xmm4", "xmm5"
|
||||
#endif
|
||||
);
|
||||
asm volatile(
|
||||
"sub %1,%2\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm2\n"
|
||||
"movdqa 0x30(%0),%%xmm6\n"
|
||||
"pavgb (%0,%4,1),%%xmm0\n"
|
||||
"pavgb 0x10(%0,%4,1),%%xmm1\n"
|
||||
"pavgb 0x20(%0,%4,1),%%xmm2\n"
|
||||
"pavgb 0x30(%0,%4,1),%%xmm6\n"
|
||||
"lea 0x40(%0),%0\n"
|
||||
"movdqa %%xmm0,%%xmm7\n"
|
||||
"shufps $0x88,%%xmm1,%%xmm0\n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm7\n"
|
||||
"pavgb %%xmm7,%%xmm0\n"
|
||||
"movdqa %%xmm2,%%xmm7\n"
|
||||
"shufps $0x88,%%xmm6,%%xmm2\n"
|
||||
"shufps $0xdd,%%xmm6,%%xmm7\n"
|
||||
"pavgb %%xmm7,%%xmm2\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"movdqa %%xmm2,%%xmm6\n"
|
||||
"pmaddubsw %%xmm4,%%xmm0\n"
|
||||
"pmaddubsw %%xmm4,%%xmm2\n"
|
||||
"pmaddubsw %%xmm3,%%xmm1\n"
|
||||
"pmaddubsw %%xmm3,%%xmm6\n"
|
||||
"phaddw %%xmm2,%%xmm0\n"
|
||||
"phaddw %%xmm6,%%xmm1\n"
|
||||
"psraw $0x8,%%xmm0\n"
|
||||
"psraw $0x8,%%xmm1\n"
|
||||
"packsswb %%xmm1,%%xmm0\n"
|
||||
"paddb %%xmm5,%%xmm0\n"
|
||||
"movlps %%xmm0,(%1)\n"
|
||||
"movhps %%xmm0,(%1,%2,1)\n"
|
||||
"lea 0x8(%1),%1\n"
|
||||
"sub $0x10,%3\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"sub %1,%2 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm2 \n"
|
||||
"movdqa 0x30(%0),%%xmm6 \n"
|
||||
"pavgb (%0,%4,1),%%xmm0 \n"
|
||||
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
|
||||
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
|
||||
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"movdqa %%xmm0,%%xmm7 \n"
|
||||
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
||||
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm7 \n"
|
||||
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
||||
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
||||
"pavgb %%xmm7,%%xmm2 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"movdqa %%xmm2,%%xmm6 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm0 \n"
|
||||
"pmaddubsw %%xmm4,%%xmm2 \n"
|
||||
"pmaddubsw %%xmm3,%%xmm1 \n"
|
||||
"pmaddubsw %%xmm3,%%xmm6 \n"
|
||||
"phaddw %%xmm2,%%xmm0 \n"
|
||||
"phaddw %%xmm6,%%xmm1 \n"
|
||||
"psraw $0x8,%%xmm0 \n"
|
||||
"psraw $0x8,%%xmm1 \n"
|
||||
"packsswb %%xmm1,%%xmm0 \n"
|
||||
"paddb %%xmm5,%%xmm0 \n"
|
||||
"movlps %%xmm0,(%1) \n"
|
||||
"movhps %%xmm0,(%1,%2,1) \n"
|
||||
"lea 0x8(%1),%1 \n"
|
||||
"sub $0x10,%3 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(src_argb0), // %0
|
||||
"+r"(dst_u), // %1
|
||||
"+r"(dst_v), // %2
|
||||
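Not part of the patch: a rough scalar model of ARGBToUVRow_SSSE3 above. The pavgb instructions average each 2x2 block of pixels (this row and the row src_stride below), then two signed weighted sums produce U and V around a 128 bias. The weights are placeholders for the kARGBToU / kARGBToV constants, which are outside this hunk; width is assumed even.

#include <stdint.h>

enum { kUB = 112, kUG = -74, kUR = -38,
       kVB = -18, kVG = -94, kVR = 112 };  // assumed BT.601-style, scaled by 256

static void ARGBToUVRow_Ref(const uint8_t* src_argb, int src_stride_argb,
                            uint8_t* dst_u, uint8_t* dst_v, int width) {
  for (int x = 0; x < width; x += 2) {
    const uint8_t* p = src_argb + 4 * x;       // top-left pixel of the 2x2 block
    const uint8_t* q = p + src_stride_argb;    // pixel one row down
    int b = (p[0] + p[4] + q[0] + q[4] + 2) >> 2;
    int g = (p[1] + p[5] + q[1] + q[5] + 2) >> 2;
    int r = (p[2] + p[6] + q[2] + q[6] + 2) >> 2;
    dst_u[x / 2] = (uint8_t)((kUB * b + kUG * g + kUR * r + 0x8080) >> 8);
    dst_v[x / 2] = (uint8_t)((kVB * b + kVG * g + kVR * r + 0x8080) >> 8);
  }
}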
@ -332,98 +332,65 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
#define OMITFP __attribute__((optimize("omit-frame-pointer")))
|
||||
#endif
|
||||
|
||||
#if defined(__APPLE__)
|
||||
// REG6 version uses 1 less register but is slower
|
||||
#define REG6
|
||||
#endif
|
||||
|
||||
#ifdef REG6
|
||||
// 6 register version only has REG_a for temporary
|
||||
#define CLOBBER "%"REG_a
|
||||
#define YUVTORGB \
|
||||
"1:" \
|
||||
"movzb (%1),%%"REG_a"\n" \
|
||||
"lea 1(%1),%1\n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
|
||||
"movzb (%2),%%"REG_a"\n" \
|
||||
"lea 1(%2),%2\n" \
|
||||
"movq 4096(%5,%%"REG_a",8),%%xmm1\n" \
|
||||
"paddsw %%xmm1,%%xmm0\n" \
|
||||
"movzb (%0),%%"REG_a"\n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
|
||||
"movzb 0x1(%0),%%"REG_a"\n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
|
||||
"lea 2(%0),%0\n" \
|
||||
"paddsw %%xmm0,%%xmm2\n" \
|
||||
"paddsw %%xmm0,%%xmm3\n" \
|
||||
"shufps $0x44,%%xmm3,%%xmm2\n" \
|
||||
"psraw $0x6,%%xmm2\n" \
|
||||
"packuswb %%xmm2,%%xmm2\n" \
|
||||
"movq %%xmm2,0x0(%3)\n" \
|
||||
"lea 8(%3),%3\n" \
|
||||
"sub $0x2,%4\n" \
|
||||
"ja 1b\n"
|
||||
#else
|
||||
#define CLOBBER "%"REG_a, "%"REG_d
|
||||
// This version produces 2 pixels
|
||||
#define YUVTORGB \
|
||||
"1:" \
|
||||
"movzb (%1),%%"REG_a"\n" \
|
||||
"lea 1(%1),%1\n" \
|
||||
"movzb (%2),%%"REG_d"\n" \
|
||||
"lea 1(%2),%2\n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
|
||||
"movzb 0(%0),%%"REG_a"\n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
|
||||
"paddsw %%xmm1,%%xmm0\n" \
|
||||
"movzb 1(%0),%%"REG_d"\n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0\n" \
|
||||
"lea 2(%0),%0\n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm1\n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm1\n" \
|
||||
"paddsw %%xmm0,%%xmm1\n" \
|
||||
"psraw $6,%%xmm1\n" \
|
||||
"packuswb %%xmm1,%%xmm1\n" \
|
||||
"movq %%xmm1,0(%3)\n" \
|
||||
"lea 8(%3),%3\n" \
|
||||
"sub $0x2,%4\n" \
|
||||
"ja 1b\n"
|
||||
"1: \n" \
|
||||
"movzb (%1),%%"REG_a" \n" \
|
||||
"lea 1(%1),%1 \n" \
|
||||
"movzb (%2),%%"REG_d" \n" \
|
||||
"lea 1(%2),%2 \n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
|
||||
"movzb 0(%0),%%"REG_a" \n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
|
||||
"paddsw %%xmm1,%%xmm0 \n" \
|
||||
"movzb 1(%0),%%"REG_d" \n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0 \n" \
|
||||
"lea 2(%0),%0 \n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm1 \n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm1 \n" \
|
||||
"paddsw %%xmm0,%%xmm1 \n" \
|
||||
"psraw $6,%%xmm1 \n" \
|
||||
"packuswb %%xmm1,%%xmm1 \n" \
|
||||
"movq %%xmm1,0(%3) \n" \
|
||||
"lea 8(%3),%3 \n" \
|
||||
"sub $0x2,%4 \n" \
|
||||
"ja 1b \n"
|
||||
// This version produces 4 pixels
|
||||
#define YUVTORGB4 \
|
||||
"1:" \
|
||||
"movzb 0(%1),%%"REG_a"\n" \
|
||||
"movzb 0(%2),%%"REG_d"\n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
|
||||
"movzb 0(%0),%%"REG_a"\n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
|
||||
"paddsw %%xmm1,%%xmm0\n" \
|
||||
"movzb 1(%0),%%"REG_d"\n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0\n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm2\n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm2\n" \
|
||||
"paddsw %%xmm0,%%xmm2\n" \
|
||||
"psraw $6,%%xmm2\n" \
|
||||
"movzb 1(%1),%%"REG_a"\n" \
|
||||
"movzb 1(%2),%%"REG_d"\n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0\n" \
|
||||
"movzb 2(%0),%%"REG_a"\n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1\n" \
|
||||
"paddsw %%xmm1,%%xmm0\n" \
|
||||
"movzb 3(%0),%%"REG_d"\n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0\n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm3\n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm3\n" \
|
||||
"paddsw %%xmm0,%%xmm3\n" \
|
||||
"psraw $6,%%xmm3\n" \
|
||||
"lea 2(%1),%1\n" \
|
||||
"lea 2(%2),%2\n" \
|
||||
"lea 4(%0),%0\n" \
|
||||
"packuswb %%xmm3,%%xmm2\n" \
|
||||
"movdqa %%xmm2,0(%3)\n" \
|
||||
"lea 16(%3),%3\n" \
|
||||
"sub $0x4,%4\n" \
|
||||
"ja 1b\n"
|
||||
#endif
|
||||
"1: \n" \
|
||||
"movzb 0(%1),%%"REG_a" \n" \
|
||||
"movzb 0(%2),%%"REG_d" \n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
|
||||
"movzb 0(%0),%%"REG_a" \n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
|
||||
"paddsw %%xmm1,%%xmm0 \n" \
|
||||
"movzb 1(%0),%%"REG_d" \n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0 \n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm2 \n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm2 \n" \
|
||||
"paddsw %%xmm0,%%xmm2 \n" \
|
||||
"psraw $6,%%xmm2 \n" \
|
||||
"movzb 1(%1),%%"REG_a" \n" \
|
||||
"movzb 1(%2),%%"REG_d" \n" \
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n" \
|
||||
"movzb 2(%0),%%"REG_a" \n" \
|
||||
"movq 4096(%5,%%"REG_d",8),%%xmm1 \n" \
|
||||
"paddsw %%xmm1,%%xmm0 \n" \
|
||||
"movzb 3(%0),%%"REG_d" \n" \
|
||||
"punpcklqdq %%xmm0,%%xmm0 \n" \
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm3 \n" \
|
||||
"movhps 0(%5,%%"REG_d",8),%%xmm3 \n" \
|
||||
"paddsw %%xmm0,%%xmm3 \n" \
|
||||
"psraw $6,%%xmm3 \n" \
|
||||
"lea 2(%1),%1 \n" \
|
||||
"lea 2(%2),%2 \n" \
|
||||
"lea 4(%0),%0 \n" \
|
||||
"packuswb %%xmm3,%%xmm2 \n" \
|
||||
"movdqa %%xmm2,0(%3) \n" \
|
||||
"lea 16(%3),%3 \n" \
|
||||
"sub $0x4,%4 \n" \
|
||||
"ja 1b \n" \
|
||||
|
||||
// 6 or 7 registers
|
||||
void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||
@ -431,7 +398,7 @@ void OMITFP FastConvertYUVToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||
const uint8* v_buf, // rdx
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
YUVTORGB
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(u_buf), // %1
|
||||
@ -452,7 +419,7 @@ void OMITFP FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf, // rdi
|
||||
const uint8* v_buf, // rdx
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
YUVTORGB4
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(u_buf), // %1
|
||||
@ -472,7 +439,7 @@ void OMITFP FastConvertYUVToBGRARow_SSE2(const uint8* y_buf, // rdi
|
||||
const uint8* v_buf, // rdx
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
YUVTORGB
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(u_buf), // %1
|
||||
@ -492,7 +459,7 @@ void OMITFP FastConvertYUVToABGRRow_SSE2(const uint8* y_buf, // rdi
|
||||
const uint8* v_buf, // rdx
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
asm volatile (
|
||||
YUVTORGB
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(u_buf), // %1
|
||||
@ -513,26 +480,26 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||
const uint8* v_buf, // rdx
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
"1:"
|
||||
"movzb (%1),%%"REG_a"\n"
|
||||
"lea 1(%1),%1\n"
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0\n"
|
||||
"movzb (%2),%%"REG_a"\n"
|
||||
"lea 1(%2),%2\n"
|
||||
"movq 4096(%5,%%"REG_a",8),%%xmm1\n"
|
||||
"paddsw %%xmm1,%%xmm0\n"
|
||||
"movzb (%0),%%"REG_a"\n"
|
||||
"lea 1(%0),%0\n"
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm2\n"
|
||||
"paddsw %%xmm0,%%xmm2\n"
|
||||
"shufps $0x44,%%xmm2,%%xmm2\n"
|
||||
"psraw $0x6,%%xmm2\n"
|
||||
"packuswb %%xmm2,%%xmm2\n"
|
||||
"movd %%xmm2,0x0(%3)\n"
|
||||
"lea 4(%3),%3\n"
|
||||
"sub $0x1,%4\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"movzb (%1),%%"REG_a" \n"
|
||||
"lea 1(%1),%1 \n"
|
||||
"movq 2048(%5,%%"REG_a",8),%%xmm0 \n"
|
||||
"movzb (%2),%%"REG_a" \n"
|
||||
"lea 1(%2),%2 \n"
|
||||
"movq 4096(%5,%%"REG_a",8),%%xmm1 \n"
|
||||
"paddsw %%xmm1,%%xmm0 \n"
|
||||
"movzb (%0),%%"REG_a" \n"
|
||||
"lea 1(%0),%0 \n"
|
||||
"movq 0(%5,%%"REG_a",8),%%xmm2 \n"
|
||||
"paddsw %%xmm0,%%xmm2 \n"
|
||||
"shufps $0x44,%%xmm2,%%xmm2 \n"
|
||||
"psraw $0x6,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
"movd %%xmm2,0x0(%3) \n"
|
||||
"lea 4(%3),%3 \n"
|
||||
"sub $0x1,%4 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(u_buf), // %1
|
||||
"+r"(v_buf), // %2
|
||||
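Not part of the patch: the movzb plus movq lookups in YUVTORGB and in the row above implement a table-driven conversion. A sketch under the assumption that the table is laid out like kCoefficientsRgbY: 256 entries of four int16 per component, with Y entries at byte offset 0, U entries at 2048 and V entries at 4096, each storing that component's contribution to B, G, R, A scaled by 64 (hence the shift by 6 and the unsigned-saturating pack).

#include <stdint.h>

// One output pixel from the three lookup tables.  table has 768 entries:
// [0..255] Y, [256..511] U, [512..767] V, each a 4-lane {B, G, R, A} vector.
static void YuvPixelFromTable_Ref(const int16_t table[768][4],
                                  uint8_t y, uint8_t u, uint8_t v,
                                  uint8_t argb[4]) {
  for (int c = 0; c < 4; ++c) {
    int sum = table[y][c] + table[256 + u][c] + table[512 + v][c];
    sum >>= 6;                       // undo the 6-bit fixed-point scaling
    if (sum < 0) sum = 0;            // packuswb saturates to 0..255
    if (sum > 255) sum = 255;
    argb[c] = (uint8_t)sum;
  }
}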
@ -550,19 +517,19 @@ void OMITFP FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||
void FastConvertYToARGBRow_SSE2(const uint8* y_buf, // rdi
|
||||
uint8* rgb_buf, // rcx
|
||||
int width) { // r8
|
||||
asm volatile(
|
||||
"1:"
|
||||
"movzb (%0),%%"REG_a"\n"
|
||||
"movzb 0x1(%0),%%"REG_d"\n"
|
||||
"movq (%3,%%"REG_a",8),%%xmm2\n"
|
||||
"lea 2(%0),%0\n"
|
||||
"movhps (%3,%%"REG_d",8),%%xmm2\n"
|
||||
"psraw $0x6,%%xmm2\n"
|
||||
"packuswb %%xmm2,%%xmm2\n"
|
||||
"movq %%xmm2,0x0(%1)\n"
|
||||
"lea 8(%1),%1\n"
|
||||
"sub $0x2,%2\n"
|
||||
"ja 1b\n"
|
||||
asm volatile (
|
||||
"1: \n"
|
||||
"movzb (%0),%%"REG_a" \n"
|
||||
"movzb 0x1(%0),%%"REG_d" \n"
|
||||
"movq (%3,%%"REG_a",8),%%xmm2 \n"
|
||||
"lea 2(%0),%0 \n"
|
||||
"movhps (%3,%%"REG_d",8),%%xmm2 \n"
|
||||
"psraw $0x6,%%xmm2 \n"
|
||||
"packuswb %%xmm2,%%xmm2 \n"
|
||||
"movq %%xmm2,0x0(%1) \n"
|
||||
"lea 8(%1),%1 \n"
|
||||
"sub $0x2,%2 \n"
|
||||
"ja 1b \n"
|
||||
: "+r"(y_buf), // %0
|
||||
"+r"(rgb_buf), // %1
|
||||
"+rm"(width) // %2
|
||||
@ -591,44 +558,44 @@ void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
asm(
|
||||
".text\n"
|
||||
".text \n"
|
||||
#if defined(OSX) || defined(IOS)
|
||||
".globl _FastConvertYUVToARGBRow_MMX\n"
|
||||
"_FastConvertYUVToARGBRow_MMX:\n"
|
||||
".globl _FastConvertYUVToARGBRow_MMX \n"
|
||||
"_FastConvertYUVToARGBRow_MMX: \n"
|
||||
#else
|
||||
".global FastConvertYUVToARGBRow_MMX\n"
|
||||
"FastConvertYUVToARGBRow_MMX:\n"
|
||||
".global FastConvertYUVToARGBRow_MMX \n"
|
||||
"FastConvertYUVToARGBRow_MMX: \n"
|
||||
#endif
|
||||
"pusha\n"
|
||||
"mov 0x24(%esp),%edx\n"
|
||||
"mov 0x28(%esp),%edi\n"
|
||||
"mov 0x2c(%esp),%esi\n"
|
||||
"mov 0x30(%esp),%ebp\n"
|
||||
"mov 0x34(%esp),%ecx\n"
|
||||
"pusha \n"
|
||||
"mov 0x24(%esp),%edx \n"
|
||||
"mov 0x28(%esp),%edi \n"
|
||||
"mov 0x2c(%esp),%esi \n"
|
||||
"mov 0x30(%esp),%ebp \n"
|
||||
"mov 0x34(%esp),%ecx \n"
|
||||
|
||||
"1:"
|
||||
"movzbl (%edi),%eax\n"
|
||||
"lea 1(%edi),%edi\n"
|
||||
"movzbl (%esi),%ebx\n"
|
||||
"lea 1(%esi),%esi\n"
|
||||
"1: \n"
|
||||
"movzbl (%edi),%eax \n"
|
||||
"lea 1(%edi),%edi \n"
|
||||
"movzbl (%esi),%ebx \n"
|
||||
"lea 1(%esi),%esi \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
|
||||
"movzbl (%edx),%eax\n"
|
||||
"movzbl (%edx),%eax \n"
|
||||
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
|
||||
"movzbl 0x1(%edx),%ebx\n"
|
||||
"movzbl 0x1(%edx),%ebx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm1\n"
|
||||
"lea 2(%edx),%edx\n"
|
||||
"lea 2(%edx),%edx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm2\n"
|
||||
"paddsw %mm0,%mm1\n"
|
||||
"paddsw %mm0,%mm2\n"
|
||||
"psraw $0x6,%mm1\n"
|
||||
"psraw $0x6,%mm2\n"
|
||||
"packuswb %mm2,%mm1\n"
|
||||
"movq %mm1,0x0(%ebp)\n"
|
||||
"lea 8(%ebp),%ebp\n"
|
||||
"sub $0x2,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"popa\n"
|
||||
"ret\n"
|
||||
"paddsw %mm0,%mm1 \n"
|
||||
"paddsw %mm0,%mm2 \n"
|
||||
"psraw $0x6,%mm1 \n"
|
||||
"psraw $0x6,%mm2 \n"
|
||||
"packuswb %mm2,%mm1 \n"
|
||||
"movq %mm1,0x0(%ebp) \n"
|
||||
"lea 8(%ebp),%ebp \n"
|
||||
"sub $0x2,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"popa \n"
|
||||
"ret \n"
|
||||
);
|
||||
|
||||
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
|
||||
@ -637,44 +604,44 @@ void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
asm(
|
||||
".text\n"
|
||||
".text \n"
|
||||
#if defined(OSX) || defined(IOS)
|
||||
".globl _FastConvertYUVToBGRARow_MMX\n"
|
||||
"_FastConvertYUVToBGRARow_MMX:\n"
|
||||
".globl _FastConvertYUVToBGRARow_MMX \n"
|
||||
"_FastConvertYUVToBGRARow_MMX: \n"
|
||||
#else
|
||||
".global FastConvertYUVToBGRARow_MMX\n"
|
||||
"FastConvertYUVToBGRARow_MMX:\n"
|
||||
".global FastConvertYUVToBGRARow_MMX \n"
|
||||
"FastConvertYUVToBGRARow_MMX: \n"
|
||||
#endif
|
||||
"pusha\n"
|
||||
"mov 0x24(%esp),%edx\n"
|
||||
"mov 0x28(%esp),%edi\n"
|
||||
"mov 0x2c(%esp),%esi\n"
|
||||
"mov 0x30(%esp),%ebp\n"
|
||||
"mov 0x34(%esp),%ecx\n"
|
||||
"pusha \n"
|
||||
"mov 0x24(%esp),%edx \n"
|
||||
"mov 0x28(%esp),%edi \n"
|
||||
"mov 0x2c(%esp),%esi \n"
|
||||
"mov 0x30(%esp),%ebp \n"
|
||||
"mov 0x34(%esp),%ecx \n"
|
||||
|
||||
"1:"
|
||||
"movzbl (%edi),%eax\n"
|
||||
"lea 1(%edi),%edi\n"
|
||||
"movzbl (%esi),%ebx\n"
|
||||
"lea 1(%esi),%esi\n"
|
||||
"1: \n"
|
||||
"movzbl (%edi),%eax \n"
|
||||
"lea 1(%edi),%edi \n"
|
||||
"movzbl (%esi),%ebx \n"
|
||||
"lea 1(%esi),%esi \n"
|
||||
"movq " UNDERSCORE "kCoefficientsBgraY+2048(,%eax,8),%mm0\n"
|
||||
"movzbl (%edx),%eax\n"
|
||||
"movzbl (%edx),%eax \n"
|
||||
"paddsw " UNDERSCORE "kCoefficientsBgraY+4096(,%ebx,8),%mm0\n"
|
||||
"movzbl 0x1(%edx),%ebx\n"
|
||||
"movzbl 0x1(%edx),%ebx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsBgraY(,%eax,8),%mm1\n"
|
||||
"lea 2(%edx),%edx\n"
|
||||
"lea 2(%edx),%edx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsBgraY(,%ebx,8),%mm2\n"
|
||||
"paddsw %mm0,%mm1\n"
|
||||
"paddsw %mm0,%mm2\n"
|
||||
"psraw $0x6,%mm1\n"
|
||||
"psraw $0x6,%mm2\n"
|
||||
"packuswb %mm2,%mm1\n"
|
||||
"movq %mm1,0x0(%ebp)\n"
|
||||
"lea 8(%ebp),%ebp\n"
|
||||
"sub $0x2,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"popa\n"
|
||||
"ret\n"
|
||||
"paddsw %mm0,%mm1 \n"
|
||||
"paddsw %mm0,%mm2 \n"
|
||||
"psraw $0x6,%mm1 \n"
|
||||
"psraw $0x6,%mm2 \n"
|
||||
"packuswb %mm2,%mm1 \n"
|
||||
"movq %mm1,0x0(%ebp) \n"
|
||||
"lea 8(%ebp),%ebp \n"
|
||||
"sub $0x2,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"popa \n"
|
||||
"ret \n"
|
||||
);
|
||||
|
||||
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
|
||||
@ -683,44 +650,44 @@ void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
asm(
|
||||
".text\n"
|
||||
".text \n"
|
||||
#if defined(OSX) || defined(IOS)
|
||||
".globl _FastConvertYUVToABGRRow_MMX\n"
|
||||
"_FastConvertYUVToABGRRow_MMX:\n"
|
||||
".globl _FastConvertYUVToABGRRow_MMX \n"
|
||||
"_FastConvertYUVToABGRRow_MMX: \n"
|
||||
#else
|
||||
".global FastConvertYUVToABGRRow_MMX\n"
|
||||
"FastConvertYUVToABGRRow_MMX:\n"
|
||||
".global FastConvertYUVToABGRRow_MMX \n"
|
||||
"FastConvertYUVToABGRRow_MMX: \n"
|
||||
#endif
|
||||
"pusha\n"
|
||||
"mov 0x24(%esp),%edx\n"
|
||||
"mov 0x28(%esp),%edi\n"
|
||||
"mov 0x2c(%esp),%esi\n"
|
||||
"mov 0x30(%esp),%ebp\n"
|
||||
"mov 0x34(%esp),%ecx\n"
|
||||
"pusha \n"
|
||||
"mov 0x24(%esp),%edx \n"
|
||||
"mov 0x28(%esp),%edi \n"
|
||||
"mov 0x2c(%esp),%esi \n"
|
||||
"mov 0x30(%esp),%ebp \n"
|
||||
"mov 0x34(%esp),%ecx \n"
|
||||
|
||||
"1:"
|
||||
"movzbl (%edi),%eax\n"
|
||||
"lea 1(%edi),%edi\n"
|
||||
"movzbl (%esi),%ebx\n"
|
||||
"lea 1(%esi),%esi\n"
|
||||
"1: \n"
|
||||
"movzbl (%edi),%eax \n"
|
||||
"lea 1(%edi),%edi \n"
|
||||
"movzbl (%esi),%ebx \n"
|
||||
"lea 1(%esi),%esi \n"
|
||||
"movq " UNDERSCORE "kCoefficientsAbgrY+2048(,%eax,8),%mm0\n"
|
||||
"movzbl (%edx),%eax\n"
|
||||
"movzbl (%edx),%eax \n"
|
||||
"paddsw " UNDERSCORE "kCoefficientsAbgrY+4096(,%ebx,8),%mm0\n"
|
||||
"movzbl 0x1(%edx),%ebx\n"
|
||||
"movzbl 0x1(%edx),%ebx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsAbgrY(,%eax,8),%mm1\n"
|
||||
"lea 2(%edx),%edx\n"
|
||||
"lea 2(%edx),%edx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsAbgrY(,%ebx,8),%mm2\n"
|
||||
"paddsw %mm0,%mm1\n"
|
||||
"paddsw %mm0,%mm2\n"
|
||||
"psraw $0x6,%mm1\n"
|
||||
"psraw $0x6,%mm2\n"
|
||||
"packuswb %mm2,%mm1\n"
|
||||
"movq %mm1,0x0(%ebp)\n"
|
||||
"lea 8(%ebp),%ebp\n"
|
||||
"sub $0x2,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"popa\n"
|
||||
"ret\n"
|
||||
"paddsw %mm0,%mm1 \n"
|
||||
"paddsw %mm0,%mm2 \n"
|
||||
"psraw $0x6,%mm1 \n"
|
||||
"psraw $0x6,%mm2 \n"
|
||||
"packuswb %mm2,%mm1 \n"
|
||||
"movq %mm1,0x0(%ebp) \n"
|
||||
"lea 8(%ebp),%ebp \n"
|
||||
"sub $0x2,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"popa \n"
|
||||
"ret \n"
|
||||
);
|
||||
|
||||
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
|
||||
@ -729,73 +696,73 @@ void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
asm(
|
||||
".text\n"
|
||||
".text \n"
|
||||
#if defined(OSX) || defined(IOS)
|
||||
".globl _FastConvertYUV444ToARGBRow_MMX\n"
|
||||
"_FastConvertYUV444ToARGBRow_MMX:\n"
|
||||
".globl _FastConvertYUV444ToARGBRow_MMX \n"
|
||||
"_FastConvertYUV444ToARGBRow_MMX: \n"
|
||||
#else
|
||||
".global FastConvertYUV444ToARGBRow_MMX\n"
|
||||
"FastConvertYUV444ToARGBRow_MMX:\n"
|
||||
".global FastConvertYUV444ToARGBRow_MMX \n"
|
||||
"FastConvertYUV444ToARGBRow_MMX: \n"
|
||||
#endif
|
||||
"pusha\n"
|
||||
"mov 0x24(%esp),%edx\n"
|
||||
"mov 0x28(%esp),%edi\n"
|
||||
"mov 0x2c(%esp),%esi\n"
|
||||
"mov 0x30(%esp),%ebp\n"
|
||||
"mov 0x34(%esp),%ecx\n"
|
||||
"pusha \n"
|
||||
"mov 0x24(%esp),%edx \n"
|
||||
"mov 0x28(%esp),%edi \n"
|
||||
"mov 0x2c(%esp),%esi \n"
|
||||
"mov 0x30(%esp),%ebp \n"
|
||||
"mov 0x34(%esp),%ecx \n"
|
||||
|
||||
"1:"
|
||||
"movzbl (%edi),%eax\n"
|
||||
"lea 1(%edi),%edi\n"
|
||||
"movzbl (%esi),%ebx\n"
|
||||
"lea 1(%esi),%esi\n"
|
||||
"1: \n"
|
||||
"movzbl (%edi),%eax \n"
|
||||
"lea 1(%edi),%edi \n"
|
||||
"movzbl (%esi),%ebx \n"
|
||||
"lea 1(%esi),%esi \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
|
||||
"movzbl (%edx),%eax\n"
|
||||
"movzbl (%edx),%eax \n"
|
||||
"paddsw " UNDERSCORE "kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"
|
||||
"lea 1(%edx),%edx\n"
|
||||
"lea 1(%edx),%edx \n"
|
||||
"paddsw " UNDERSCORE "kCoefficientsRgbY(,%eax,8),%mm0\n"
|
||||
"psraw $0x6,%mm0\n"
|
||||
"packuswb %mm0,%mm0\n"
|
||||
"movd %mm0,0x0(%ebp)\n"
|
||||
"lea 4(%ebp),%ebp\n"
|
||||
"sub $0x1,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"popa\n"
|
||||
"ret\n"
|
||||
"psraw $0x6,%mm0 \n"
|
||||
"packuswb %mm0,%mm0 \n"
|
||||
"movd %mm0,0x0(%ebp) \n"
|
||||
"lea 4(%ebp),%ebp \n"
|
||||
"sub $0x1,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"popa \n"
|
||||
"ret \n"
|
||||
);
|
||||
|
||||
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
asm(
|
||||
".text\n"
|
||||
".text \n"
|
||||
#if defined(OSX) || defined(IOS)
|
||||
".globl _FastConvertYToARGBRow_MMX\n"
|
||||
"_FastConvertYToARGBRow_MMX:\n"
|
||||
".globl _FastConvertYToARGBRow_MMX \n"
|
||||
"_FastConvertYToARGBRow_MMX: \n"
|
||||
#else
|
||||
".global FastConvertYToARGBRow_MMX\n"
|
||||
"FastConvertYToARGBRow_MMX:\n"
|
||||
".global FastConvertYToARGBRow_MMX \n"
|
||||
"FastConvertYToARGBRow_MMX: \n"
|
||||
#endif
|
||||
"push %ebx\n"
|
||||
"mov 0x8(%esp),%eax\n"
|
||||
"mov 0xc(%esp),%edx\n"
|
||||
"mov 0x10(%esp),%ecx\n"
|
||||
"push %ebx \n"
|
||||
"mov 0x8(%esp),%eax \n"
|
||||
"mov 0xc(%esp),%edx \n"
|
||||
"mov 0x10(%esp),%ecx \n"
|
||||
|
||||
"1:"
|
||||
"movzbl (%eax),%ebx\n"
|
||||
"1: \n"
|
||||
"movzbl (%eax),%ebx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm0\n"
|
||||
"psraw $0x6,%mm0\n"
|
||||
"movzbl 0x1(%eax),%ebx\n"
|
||||
"psraw $0x6,%mm0 \n"
|
||||
"movzbl 0x1(%eax),%ebx \n"
|
||||
"movq " UNDERSCORE "kCoefficientsRgbY(,%ebx,8),%mm1\n"
|
||||
"psraw $0x6,%mm1\n"
|
||||
"packuswb %mm1,%mm0\n"
|
||||
"lea 0x2(%eax),%eax\n"
|
||||
"movq %mm0,(%edx)\n"
|
||||
"lea 0x8(%edx),%edx\n"
|
||||
"sub $0x2,%ecx\n"
|
||||
"ja 1b\n"
|
||||
"pop %ebx\n"
|
||||
"ret\n"
|
||||
"psraw $0x6,%mm1 \n"
|
||||
"packuswb %mm1,%mm0 \n"
|
||||
"lea 0x2(%eax),%eax \n"
|
||||
"movq %mm0,(%edx) \n"
|
||||
"lea 0x8(%edx),%edx \n"
|
||||
"sub $0x2,%ecx \n"
|
||||
"ja 1b \n"
|
||||
"pop %ebx \n"
|
||||
"ret \n"
|
||||
);
|
||||
|
||||
#endif
|
||||
|
||||
@ -92,7 +92,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff000000
|
||||
pslld xmm5, 24
|
||||
|
||||
wloop:
|
||||
convertloop:
|
||||
movq xmm0, qword ptr [eax]
|
||||
lea eax, [eax + 8]
|
||||
punpcklbw xmm0, xmm0
|
||||
@ -105,7 +105,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
movdqa [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 8
|
||||
ja wloop
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -753,18 +753,18 @@ SIMD_ALIGNED(const int16 kUVBiasR[8]) = {
|
||||
__asm punpcklwd xmm0, xmm0 /* UVUV (upsample) */ \
|
||||
__asm movdqa xmm1, xmm0 \
|
||||
__asm movdqa xmm2, xmm0 \
|
||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
||||
__asm psubw xmm1, kUVBiasG \
|
||||
__asm psubw xmm2, kUVBiasR \
|
||||
__asm pmaddubsw xmm0, kUVToB /* scale B UV */ \
|
||||
__asm pmaddubsw xmm1, kUVToG /* scale G UV */ \
|
||||
__asm pmaddubsw xmm2, kUVToR /* scale R UV */ \
|
||||
__asm psubw xmm0, kUVBiasB /* unbias back to signed */ \
|
||||
__asm psubw xmm1, kUVBiasG \
|
||||
__asm psubw xmm2, kUVBiasR \
|
||||
/* Step 2: Find Y contribution to 8 R,G,B values */ \
|
||||
__asm movq xmm3, qword ptr [eax] \
|
||||
__asm lea eax, [eax + 8] \
|
||||
__asm punpcklbw xmm3, xmm4 \
|
||||
__asm psubsw xmm3, kYSub16 \
|
||||
__asm pmullw xmm3, kYToRgb \
|
||||
__asm psubsw xmm3, kYSub16 \
|
||||
__asm pmullw xmm3, kYToRgb \
|
||||
__asm paddw xmm0, xmm3 /* B += Y */ \
|
||||
__asm paddw xmm1, xmm3 /* G += Y */ \
|
||||
__asm paddw xmm2, xmm3 /* R += Y */ \
source/scale.cc: 2250 lines changed (diff suppressed because it is too large)