/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "source/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

// This module is for GCC x86 and x64.
#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC 4.2 on OSX has a link error when passing static or const to inline.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif

#ifdef HAS_ARGBTOYROW_SSSE3

// Constants for ARGB.
CONST vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

CONST vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

CONST vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
};

// Constants for BGRA.
CONST vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

CONST vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

CONST vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR.
CONST vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

CONST vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

CONST vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

CONST uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

CONST uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

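// The coefficient tables above encode the fixed point (BT.601 style) RGB to
// YUV conversion used by the SSSE3 kernels below.  As an exposition-only
// sketch (this helper and its name are not part of the library), the Y path
// for ARGB data, stored as B,G,R,A bytes, is equivalent to:
static inline uint8 ARGBPixelToY_Reference(uint8 b, uint8 g, uint8 r) {
  // kARGBToY weights at 1/128 scale, followed by the kAddY16 bias.
  return static_cast<uint8>(((13 * b + 65 * g + 33 * r) >> 7) + 16);
}
// kARGBToU / kARGBToV hold the matching U and V weights; the UV row functions
// apply them with an 8 bit shift and the kAddUV128 (+128) bias.
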
// Shuffle table for converting RGB24 to ARGB.
CONST uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
CONST uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
CONST uvec8 kShuffleMaskABGRToARGB = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
CONST uvec8 kShuffleMaskBGRAToARGB = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

// Shuffle table for converting ARGB to RGB24.
CONST uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
CONST uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

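// In the shuffle tables above, pshufb treats any index byte with the high bit
// set (the 128u entries) as "write zero"; otherwise the low four bits select
// a source byte.  A scalar model of that operation, for exposition only (the
// helper name is hypothetical and nothing below calls it):
static inline void PShufBModel_C(const uint8* src, const uint8* shuffle,
                                 uint8* dst) {
  for (int i = 0; i < 16; ++i) {
    dst[i] = (shuffle[i] & 0x80) ? 0 : src[shuffle[i] & 0x0f];
  }
}
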
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pslld $0x18,%%xmm5 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movq (%0),%%xmm0 \n"
|
|
"lea 0x8(%0),%0 \n"
|
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpcklwd %%xmm0,%%xmm0 \n"
|
|
"punpckhwd %%xmm1,%%xmm1 \n"
|
|
"por %%xmm5,%%xmm0 \n"
|
|
"por %%xmm5,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_y), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
|
asm volatile (
|
|
"movdqa %3,%%xmm5 \n"
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm0 \n"
|
|
"sub $0x4,%2 \n"
|
|
"movdqa %%xmm0,(%0,%1,1) \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"jg 1b \n"
|
|
|
|
: "+r"(src_abgr), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskABGRToARGB) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
|
|
asm volatile (
|
|
"movdqa %3,%%xmm5 \n"
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm0 \n"
|
|
"sub $0x4,%2 \n"
|
|
"movdqa %%xmm0,(%0,%1,1) \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_bgra), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskBGRAToARGB) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
|
|
"pslld $0x18,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm3 \n"
|
|
"lea 0x30(%0),%0 \n"
|
|
"movdqa %%xmm3,%%xmm2 \n"
|
|
"palignr $0x8,%%xmm1,%%xmm2 \n"
|
|
"pshufb %%xmm4,%%xmm2 \n"
|
|
"por %%xmm5,%%xmm2 \n"
|
|
"palignr $0xc,%%xmm0,%%xmm1 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"movdqa %%xmm2,0x20(%1) \n"
|
|
"por %%xmm5,%%xmm0 \n"
|
|
"pshufb %%xmm4,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"por %%xmm5,%%xmm1 \n"
|
|
"palignr $0x4,%%xmm3,%%xmm3 \n"
|
|
"pshufb %%xmm4,%%xmm3 \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"por %%xmm5,%%xmm3 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm3,0x30(%1) \n"
|
|
"lea 0x40(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_rgb24), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskRGB24ToARGB) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n" // generate mask 0xff000000
|
|
"pslld $0x18,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm3 \n"
|
|
"lea 0x30(%0),%0 \n"
|
|
"movdqa %%xmm3,%%xmm2 \n"
|
|
"palignr $0x8,%%xmm1,%%xmm2 \n"
|
|
"pshufb %%xmm4,%%xmm2 \n"
|
|
"por %%xmm5,%%xmm2 \n"
|
|
"palignr $0xc,%%xmm0,%%xmm1 \n"
|
|
"pshufb %%xmm4,%%xmm0 \n"
|
|
"movdqa %%xmm2,0x20(%1) \n"
|
|
"por %%xmm5,%%xmm0 \n"
|
|
"pshufb %%xmm4,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"por %%xmm5,%%xmm1 \n"
|
|
"palignr $0x4,%%xmm3,%%xmm3 \n"
|
|
"pshufb %%xmm4,%%xmm3 \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"por %%xmm5,%%xmm3 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm3,0x30(%1) \n"
|
|
"lea 0x40(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_raw), // %0
|
|
"+r"(dst_argb), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskRAWToARGB) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"mov $0x1080108,%%eax \n"
|
|
"movd %%eax,%%xmm5 \n"
|
|
"pshufd $0x0,%%xmm5,%%xmm5 \n"
|
|
"mov $0x20082008,%%eax \n"
|
|
"movd %%eax,%%xmm6 \n"
|
|
"pshufd $0x0,%%xmm6,%%xmm6 \n"
|
|
"pcmpeqb %%xmm3,%%xmm3 \n"
|
|
"psllw $0xb,%%xmm3 \n"
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psllw $0xa,%%xmm4 \n"
|
|
"psrlw $0x5,%%xmm4 \n"
|
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
|
"psllw $0x8,%%xmm7 \n"
|
|
"sub %0,%1 \n"
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"pand %%xmm3,%%xmm1 \n"
|
|
"psllw $0xb,%%xmm2 \n"
|
|
"pmulhuw %%xmm5,%%xmm1 \n"
|
|
"pmulhuw %%xmm5,%%xmm2 \n"
|
|
"psllw $0x8,%%xmm1 \n"
|
|
"por %%xmm2,%%xmm1 \n"
|
|
"pand %%xmm4,%%xmm0 \n"
|
|
"pmulhuw %%xmm6,%%xmm0 \n"
|
|
"por %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm1,%%xmm2 \n"
|
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
|
"punpckhbw %%xmm0,%%xmm2 \n"
|
|
"movdqa %%xmm1,(%1,%0,2) \n"
|
|
"movdqa %%xmm2,0x10(%1,%0,2) \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc", "eax"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"mov $0x1080108,%%eax \n"
|
|
"movd %%eax,%%xmm5 \n"
|
|
"pshufd $0x0,%%xmm5,%%xmm5 \n"
|
|
"mov $0x42004200,%%eax \n"
|
|
"movd %%eax,%%xmm6 \n"
|
|
"pshufd $0x0,%%xmm6,%%xmm6 \n"
|
|
"pcmpeqb %%xmm3,%%xmm3 \n"
|
|
"psllw $0xb,%%xmm3 \n"
|
|
"movdqa %%xmm3,%%xmm4 \n"
|
|
"psrlw $0x6,%%xmm4 \n"
|
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
|
"psllw $0x8,%%xmm7 \n"
|
|
"sub %0,%1 \n"
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"psllw $0x1,%%xmm1 \n"
|
|
"psllw $0xb,%%xmm2 \n"
|
|
"pand %%xmm3,%%xmm1 \n"
|
|
"pmulhuw %%xmm5,%%xmm2 \n"
|
|
"pmulhuw %%xmm5,%%xmm1 \n"
|
|
"psllw $0x8,%%xmm1 \n"
|
|
"por %%xmm2,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"pand %%xmm4,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm2 \n"
|
|
"pmulhuw %%xmm6,%%xmm0 \n"
|
|
"pand %%xmm7,%%xmm2 \n"
|
|
"por %%xmm2,%%xmm0 \n"
|
|
"movdqa %%xmm1,%%xmm2 \n"
|
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
|
"punpckhbw %%xmm0,%%xmm2 \n"
|
|
"movdqa %%xmm1,(%1,%0,2) \n"
|
|
"movdqa %%xmm2,0x10(%1,%0,2) \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc", "eax"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"mov $0xf0f0f0f,%%eax \n"
|
|
"movd %%eax,%%xmm4 \n"
|
|
"pshufd $0x0,%%xmm4,%%xmm4 \n"
|
|
"movdqa %%xmm4,%%xmm5 \n"
|
|
"pslld $0x4,%%xmm5 \n"
|
|
"sub %0,%1 \n"
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"pand %%xmm4,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm3 \n"
|
|
"psllw $0x4,%%xmm1 \n"
|
|
"psrlw $0x4,%%xmm3 \n"
|
|
"por %%xmm1,%%xmm0 \n"
|
|
"por %%xmm3,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpcklbw %%xmm2,%%xmm0 \n"
|
|
"punpckhbw %%xmm2,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%1,%0,2) \n"
|
|
"movdqa %%xmm1,0x10(%1,%0,2) \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc", "eax"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"movdqa %3,%%xmm6 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"pshufb %%xmm6,%%xmm0 \n"
|
|
"pshufb %%xmm6,%%xmm1 \n"
|
|
"pshufb %%xmm6,%%xmm2 \n"
|
|
"pshufb %%xmm6,%%xmm3 \n"
|
|
"movdqa %%xmm1,%%xmm4 \n"
|
|
"psrldq $0x4,%%xmm1 \n"
|
|
"pslldq $0xc,%%xmm4 \n"
|
|
"movdqa %%xmm2,%%xmm5 \n"
|
|
"por %%xmm4,%%xmm0 \n"
|
|
"pslldq $0x8,%%xmm5 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"por %%xmm5,%%xmm1 \n"
|
|
"psrldq $0x8,%%xmm2 \n"
|
|
"pslldq $0x4,%%xmm3 \n"
|
|
"por %%xmm3,%%xmm2 \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"movdqa %%xmm2,0x20(%1) \n"
|
|
"lea 0x30(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskARGBToRGB24) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"movdqa %3,%%xmm6 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"pshufb %%xmm6,%%xmm0 \n"
|
|
"pshufb %%xmm6,%%xmm1 \n"
|
|
"pshufb %%xmm6,%%xmm2 \n"
|
|
"pshufb %%xmm6,%%xmm3 \n"
|
|
"movdqa %%xmm1,%%xmm4 \n"
|
|
"psrldq $0x4,%%xmm1 \n"
|
|
"pslldq $0xc,%%xmm4 \n"
|
|
"movdqa %%xmm2,%%xmm5 \n"
|
|
"por %%xmm4,%%xmm0 \n"
|
|
"pslldq $0x8,%%xmm5 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"por %%xmm5,%%xmm1 \n"
|
|
"psrldq $0x8,%%xmm2 \n"
|
|
"pslldq $0x4,%%xmm3 \n"
|
|
"por %%xmm3,%%xmm2 \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"movdqa %%xmm2,0x20(%1) \n"
|
|
"lea 0x30(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kShuffleMaskARGBToRAW) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm3,%%xmm3 \n"
|
|
"psrld $0x1b,%%xmm3 \n"
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psrld $0x1a,%%xmm4 \n"
|
|
"pslld $0x5,%%xmm4 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pslld $0xb,%%xmm5 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"pslld $0x8,%%xmm0 \n"
|
|
"psrld $0x3,%%xmm1 \n"
|
|
"psrld $0x5,%%xmm2 \n"
|
|
"psrad $0x10,%%xmm0 \n"
|
|
"pand %%xmm3,%%xmm1 \n"
|
|
"pand %%xmm4,%%xmm2 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"por %%xmm2,%%xmm1 \n"
|
|
"por %%xmm1,%%xmm0 \n"
|
|
"packssdw %%xmm0,%%xmm0 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psrld $0x1b,%%xmm4 \n"
|
|
"movdqa %%xmm4,%%xmm5 \n"
|
|
"pslld $0x5,%%xmm5 \n"
|
|
"movdqa %%xmm4,%%xmm6 \n"
|
|
"pslld $0xa,%%xmm6 \n"
|
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
|
"pslld $0xf,%%xmm7 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm3 \n"
|
|
"psrad $0x10,%%xmm0 \n"
|
|
"psrld $0x3,%%xmm1 \n"
|
|
"psrld $0x6,%%xmm2 \n"
|
|
"psrld $0x9,%%xmm3 \n"
|
|
"pand %%xmm7,%%xmm0 \n"
|
|
"pand %%xmm4,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm2 \n"
|
|
"pand %%xmm6,%%xmm3 \n"
|
|
"por %%xmm1,%%xmm0 \n"
|
|
"por %%xmm3,%%xmm2 \n"
|
|
"por %%xmm2,%%xmm0 \n"
|
|
"packssdw %%xmm0,%%xmm0 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"psllw $0xc,%%xmm4 \n"
|
|
"movdqa %%xmm4,%%xmm3 \n"
|
|
"psrlw $0x8,%%xmm3 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pand %%xmm3,%%xmm0 \n"
|
|
"pand %%xmm4,%%xmm1 \n"
|
|
"psrlq $0x4,%%xmm0 \n"
|
|
"psrlq $0x8,%%xmm1 \n"
|
|
"por %%xmm1,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x4,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kARGBToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kARGBToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
// TODO(fbarchard): Pass xmm constants to a single block of assembly.
// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
// or 4 if the stack frame is disabled. Using two assembly blocks is a
// workaround, though it is considered unsafe.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kARGBToU), // %0
|
|
"m"(kARGBToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm6 \n"
|
|
"pavgb (%0,%4,1),%%xmm0 \n"
|
|
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
|
|
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
|
|
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_argb))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kARGBToU), // %0
|
|
"m"(kARGBToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm6 \n"
|
|
"movdqu (%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm1 \n"
|
|
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_argb0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_argb))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_bgra), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kBGRAToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_bgra), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kBGRAToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kBGRAToU), // %0
|
|
"m"(kBGRAToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm6 \n"
|
|
"pavgb (%0,%4,1),%%xmm0 \n"
|
|
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
|
|
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
|
|
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_bgra0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_bgra))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kBGRAToU), // %0
|
|
"m"(kBGRAToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm6 \n"
|
|
"movdqu (%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm1 \n"
|
|
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_bgra0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_bgra))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_abgr), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kABGRToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"movdqa %4,%%xmm5 \n"
|
|
"movdqa %3,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm3 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm1 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm4,%%xmm3 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"phaddw %%xmm1,%%xmm0 \n"
|
|
"phaddw %%xmm3,%%xmm2 \n"
|
|
"psrlw $0x7,%%xmm0 \n"
|
|
"psrlw $0x7,%%xmm2 \n"
|
|
"packuswb %%xmm2,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_abgr), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
: "m"(kABGRToY), // %3
|
|
"m"(kAddY16) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kABGRToU), // %0
|
|
"m"(kABGRToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa 0x20(%0),%%xmm2 \n"
|
|
"movdqa 0x30(%0),%%xmm6 \n"
|
|
"pavgb (%0,%4,1),%%xmm0 \n"
|
|
"pavgb 0x10(%0,%4,1),%%xmm1 \n"
|
|
"pavgb 0x20(%0,%4,1),%%xmm2 \n"
|
|
"pavgb 0x30(%0,%4,1),%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_abgr0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_abgr))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
|
|
uint8* dst_u, uint8* dst_v, int width) {
|
|
asm volatile (
|
|
"movdqa %0,%%xmm4 \n"
|
|
"movdqa %1,%%xmm3 \n"
|
|
"movdqa %2,%%xmm5 \n"
|
|
:
|
|
: "m"(kABGRToU), // %0
|
|
"m"(kABGRToV), // %1
|
|
"m"(kAddUV128) // %2
|
|
);
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu 0x20(%0),%%xmm2 \n"
|
|
"movdqu 0x30(%0),%%xmm6 \n"
|
|
"movdqu (%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm1 \n"
|
|
"movdqu 0x20(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqu 0x30(%0,%4,1),%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm6 \n"
|
|
"lea 0x40(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm1,%%xmm0 \n"
|
|
"shufps $0xdd,%%xmm1,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm7 \n"
|
|
"shufps $0x88,%%xmm6,%%xmm2 \n"
|
|
"shufps $0xdd,%%xmm6,%%xmm7 \n"
|
|
"pavgb %%xmm7,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm6 \n"
|
|
"pmaddubsw %%xmm4,%%xmm0 \n"
|
|
"pmaddubsw %%xmm4,%%xmm2 \n"
|
|
"pmaddubsw %%xmm3,%%xmm1 \n"
|
|
"pmaddubsw %%xmm3,%%xmm6 \n"
|
|
"phaddw %%xmm2,%%xmm0 \n"
|
|
"phaddw %%xmm6,%%xmm1 \n"
|
|
"psraw $0x8,%%xmm0 \n"
|
|
"psraw $0x8,%%xmm1 \n"
|
|
"packsswb %%xmm1,%%xmm0 \n"
|
|
"paddb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%3 \n"
|
|
"movlps %%xmm0,(%1) \n"
|
|
"movhps %%xmm0,(%1,%2,1) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_abgr0), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+rm"(width) // %3
|
|
: "r"(static_cast<intptr_t>(src_stride_abgr))
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
#endif // HAS_ARGBTOYROW_SSSE3
|
|
|
|
#ifdef HAS_I420TOARGBROW_SSSE3
#define UB 127 /* 2.018 * 64 rounds to 129; clamped to 127, the int8 maximum */
#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

struct {
  vec8 kUVToB;
  vec8 kUVToG;
  vec8 kUVToR;
  vec16 kUVBiasB;
  vec16 kUVBiasG;
  vec16 kUVBiasR;
  vec16 kYSub16;
  vec16 kYToRgb;
} CONST SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG }
};

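// Worked out per pixel (before packuswb clamps to 0..255), the constants
// above implement:
//   B = ((Y - 16) * 74 + (U - 128) * 127                  ) >> 6
//   G = ((Y - 16) * 74 - (U - 128) * 25 - (V - 128) * 52  ) >> 6
//   R = ((Y - 16) * 74                  + (V - 128) * 102 ) >> 6
// kUVToB/G/R supply the U/V weights for pmaddubsw, kUVBiasB/G/R fold in the
// -128 chroma offsets, and kYSub16 / kYToRgb provide the (Y - 16) * 74 term.
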
// Convert 8 pixels
|
|
#define YUVTORGB \
|
|
"movd (%1),%%xmm0 \n" \
|
|
"movd (%1,%2,1),%%xmm1 \n" \
|
|
"lea 0x4(%1),%1 \n" \
|
|
"punpcklbw %%xmm1,%%xmm0 \n" \
|
|
"punpcklwd %%xmm0,%%xmm0 \n" \
|
|
"movdqa %%xmm0,%%xmm1 \n" \
|
|
"movdqa %%xmm0,%%xmm2 \n" \
|
|
"pmaddubsw (%5),%%xmm0 \n" \
|
|
"pmaddubsw 16(%5),%%xmm1 \n" \
|
|
"pmaddubsw 32(%5),%%xmm2 \n" \
|
|
"psubw 48(%5),%%xmm0 \n" \
|
|
"psubw 64(%5),%%xmm1 \n" \
|
|
"psubw 80(%5),%%xmm2 \n" \
|
|
"movq (%0),%%xmm3 \n" \
|
|
"lea 0x8(%0),%0 \n" \
|
|
"punpcklbw %%xmm4,%%xmm3 \n" \
|
|
"psubsw 96(%5),%%xmm3 \n" \
|
|
"pmullw 112(%5),%%xmm3 \n" \
|
|
"paddsw %%xmm3,%%xmm0 \n" \
|
|
"paddsw %%xmm3,%%xmm1 \n" \
|
|
"paddsw %%xmm3,%%xmm2 \n" \
|
|
"psraw $0x6,%%xmm0 \n" \
|
|
"psraw $0x6,%%xmm1 \n" \
|
|
"psraw $0x6,%%xmm2 \n" \
|
|
"packuswb %%xmm0,%%xmm0 \n" \
|
|
"packuswb %%xmm1,%%xmm1 \n" \
|
|
"packuswb %%xmm2,%%xmm2 \n"
|
|
|
|
void OMITFP I420ToARGBRow_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpcklwd %%xmm2,%%xmm0 \n"
|
|
"punpckhwd %%xmm2,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%3) \n"
|
|
"movdqa %%xmm1,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I420ToBGRARow_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
|
"punpcklbw %%xmm2,%%xmm5 \n"
|
|
"movdqa %%xmm5,%%xmm0 \n"
|
|
"punpcklwd %%xmm1,%%xmm5 \n"
|
|
"punpckhwd %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm5,(%3) \n"
|
|
"movdqa %%xmm0,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I420ToABGRRow_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"punpcklbw %%xmm1,%%xmm2 \n"
|
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm1 \n"
|
|
"punpcklwd %%xmm0,%%xmm2 \n"
|
|
"punpckhwd %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm2,(%3) \n"
|
|
"movdqa %%xmm1,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpcklwd %%xmm2,%%xmm0 \n"
|
|
"punpckhwd %%xmm2,%%xmm1 \n"
|
|
"movdqu %%xmm0,(%3) \n"
|
|
"movdqu %%xmm1,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"punpcklbw %%xmm0,%%xmm1 \n"
|
|
"punpcklbw %%xmm2,%%xmm5 \n"
|
|
"movdqa %%xmm5,%%xmm0 \n"
|
|
"punpcklwd %%xmm1,%%xmm5 \n"
|
|
"punpckhwd %%xmm1,%%xmm0 \n"
|
|
"movdqu %%xmm5,(%3) \n"
|
|
"movdqu %%xmm0,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
YUVTORGB
|
|
"punpcklbw %%xmm1,%%xmm2 \n"
|
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
|
"movdqa %%xmm2,%%xmm1 \n"
|
|
"punpcklwd %%xmm0,%%xmm2 \n"
|
|
"punpckhwd %%xmm0,%%xmm1 \n"
|
|
"movdqu %%xmm2,(%3) \n"
|
|
"movdqu %%xmm1,0x10(%3) \n"
|
|
"lea 0x20(%3),%3 \n"
|
|
"sub $0x8,%4 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
|
|
const uint8* u_buf,
|
|
const uint8* v_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"sub %1,%2 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movd (%1),%%xmm0 \n"
|
|
"movd (%1,%2,1),%%xmm1 \n"
|
|
"lea 0x4(%1),%1 \n"
|
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"pmaddubsw (%5),%%xmm0 \n"
|
|
"pmaddubsw 16(%5),%%xmm1 \n"
|
|
"pmaddubsw 32(%5),%%xmm2 \n"
|
|
"psubw 48(%5),%%xmm0 \n"
|
|
"psubw 64(%5),%%xmm1 \n"
|
|
"psubw 80(%5),%%xmm2 \n"
|
|
"movd (%0),%%xmm3 \n"
|
|
"lea 0x4(%0),%0 \n"
|
|
"punpcklbw %%xmm4,%%xmm3 \n"
|
|
"psubsw 96(%5),%%xmm3 \n"
|
|
"pmullw 112(%5),%%xmm3 \n"
|
|
"paddsw %%xmm3,%%xmm0 \n"
|
|
"paddsw %%xmm3,%%xmm1 \n"
|
|
"paddsw %%xmm3,%%xmm2 \n"
|
|
"psraw $0x6,%%xmm0 \n"
|
|
"psraw $0x6,%%xmm1 \n"
|
|
"psraw $0x6,%%xmm2 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"packuswb %%xmm2,%%xmm2 \n"
|
|
"punpcklbw %%xmm1,%%xmm0 \n"
|
|
"punpcklbw %%xmm5,%%xmm2 \n"
|
|
"punpcklwd %%xmm2,%%xmm0 \n"
|
|
"sub $0x4,%4 \n"
|
|
"movdqa %%xmm0,(%3) \n"
|
|
"lea 0x10(%3),%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(u_buf), // %1
|
|
"+r"(v_buf), // %2
|
|
"+r"(rgb_buf), // %3
|
|
"+rm"(width) // %4
|
|
: "r"(&kYuvConstants.kUVToB) // %5
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_YTOARGBROW_SSE2
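// YToARGBRow_SSE2 expands luma to a grey ARGB row in 8.8 fixed point:
// 0x1000 is 16 << 8 (the Y offset) and 0x012a is 298, roughly 1.164 * 256,
// so pmulhuw yields approximately (y - 16) * 1.164 per pixel before the
// result is woven into B, G, R with a constant 0xff alpha.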
|
|
void YToARGBRow_SSE2(const uint8* y_buf,
|
|
uint8* rgb_buf,
|
|
int width) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"pslld $0x18,%%xmm4 \n"
|
|
"mov $0x10001000,%%eax \n"
|
|
"movd %%eax,%%xmm3 \n"
|
|
"pshufd $0x0,%%xmm3,%%xmm3 \n"
|
|
"mov $0x012a012a,%%eax \n"
|
|
"movd %%eax,%%xmm2 \n"
|
|
"pshufd $0x0,%%xmm2,%%xmm2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
|
"movq (%0),%%xmm0 \n"
|
|
"lea 0x8(%0),%0 \n"
|
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
|
"psubusw %%xmm3,%%xmm0 \n"
|
|
"pmulhuw %%xmm2,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
|
|
// Step 2: Weave into ARGB
|
|
"punpcklbw %%xmm0,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"punpcklwd %%xmm0,%%xmm0 \n"
|
|
"punpckhwd %%xmm1,%%xmm1 \n"
|
|
"por %%xmm4,%%xmm0 \n"
|
|
"por %%xmm4,%%xmm1 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"movdqa %%xmm1,16(%1) \n"
|
|
"lea 32(%1),%1 \n"
|
|
|
|
"sub $0x8,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(y_buf), // %0
|
|
"+r"(rgb_buf), // %1
|
|
"+rm"(width) // %2
|
|
:
|
|
: "memory", "cc", "eax"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_MIRRORROW_SSSE3
|
|
// Shuffle table for reversing the bytes.
|
|
CONST uvec8 kShuffleMirror = {
|
|
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
|
};
|
|
|
|
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
|
intptr_t temp_width = static_cast<intptr_t>(width);
|
|
asm volatile (
|
|
"movdqa %3,%%xmm5 \n"
|
|
"lea -0x10(%0),%0 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0,%2),%%xmm0 \n"
|
|
"pshufb %%xmm5,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(temp_width) // %2
|
|
: "m"(kShuffleMirror) // %3
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_MIRRORROW_SSE2
|
|
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
|
|
intptr_t temp_width = static_cast<intptr_t>(width);
|
|
asm volatile (
|
|
"lea -0x10(%0),%0 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0,%2),%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"psllw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"por %%xmm1,%%xmm0 \n"
|
|
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
|
|
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
|
|
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(temp_width) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_MIRRORROW_UV_SSSE3
|
|
// Shuffle table for reversing the bytes of UV channels.
|
|
CONST uvec8 kShuffleMirrorUV = {
|
|
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
|
|
};
|
|
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
|
|
int width) {
|
|
intptr_t temp_width = static_cast<intptr_t>(width);
|
|
asm volatile (
|
|
"movdqa %4,%%xmm1 \n"
|
|
"lea -16(%0,%3,2),%0 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"lea -16(%0),%0 \n"
|
|
"pshufb %%xmm1,%%xmm0 \n"
|
|
"sub $8,%3 \n"
|
|
"movlpd %%xmm0,(%1) \n"
|
|
"movhpd %%xmm0,(%1,%2) \n"
|
|
"lea 8(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+r"(temp_width) // %3
|
|
: "m"(kShuffleMirrorUV) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_ADDROW_SSE2
|
|
// dst and width aligned to 16
|
|
void AddRow_SSE2(const uint8* src, uint16* dst, int width) {
|
|
asm volatile (
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm2 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movdqa (%1),%%xmm0 \n"
|
|
"movdqa 0x10(%1),%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm3 \n"
|
|
"punpcklbw %%xmm4,%%xmm2 \n"
|
|
"punpckhbw %%xmm4,%%xmm3 \n"
|
|
"paddusw %%xmm2,%%xmm0 \n"
|
|
"paddusw %%xmm3,%%xmm1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(width) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
// dst and width aligned to 16
|
|
void SubRow_SSE2(const uint8* src, uint16* dst, int width) {
|
|
asm volatile (
|
|
"pxor %%xmm4,%%xmm4 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm2 \n"
|
|
"lea 0x10(%0),%0 \n"
|
|
"movdqa (%1),%%xmm0 \n"
|
|
"movdqa 0x10(%1),%%xmm1 \n"
|
|
"movdqa %%xmm2,%%xmm3 \n"
|
|
"punpcklbw %%xmm4,%%xmm2 \n"
|
|
"punpckhbw %%xmm4,%%xmm3 \n"
|
|
"psubusw %%xmm2,%%xmm0 \n"
|
|
"psubusw %%xmm3,%%xmm1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"movdqa %%xmm1,0x10(%1) \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(width) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
|
|
#endif
|
|
);
|
|
}
|
|
#endif // HAS_ADDROW_SSE2
|
|
|
|
#ifdef HAS_SPLITUV_SSE2
|
|
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"movdqa %%xmm0,%%xmm2 \n"
|
|
"movdqa %%xmm1,%%xmm3 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm2 \n"
|
|
"psrlw $0x8,%%xmm3 \n"
|
|
"packuswb %%xmm3,%%xmm2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"movdqa %%xmm2,(%1,%2) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_uv), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_v), // %2
|
|
"+r"(pix) // %3
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_COPYROW_SSE2
|
|
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
|
|
asm volatile (
|
|
"sub %0,%1 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa %%xmm0,(%0,%1) \n"
|
|
"movdqa %%xmm1,0x10(%0,%1) \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"sub $0x20,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src), // %0
|
|
"+r"(dst), // %1
|
|
"+r"(count) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1"
|
|
#endif
|
|
);
|
|
}
|
|
#endif // HAS_COPYROW_SSE2
|
|
|
|
#ifdef HAS_COPYROW_X86
|
|
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
|
|
size_t width_tmp = static_cast<size_t>(width);
|
|
asm volatile (
|
|
"shr $0x2,%2 \n"
|
|
"rep movsl \n"
|
|
: "+S"(src), // %0
|
|
"+D"(dst), // %1
|
|
"+c"(width_tmp) // %2
|
|
:
|
|
: "memory", "cc"
|
|
);
|
|
}
|
|
#endif
|
|
|
|
#ifdef HAS_YUY2TOYROW_SSE2
|
|
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"sub $0x10,%2 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_yuy2), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
|
|
uint8* dst_u, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa (%0,%4,1),%%xmm2 \n"
|
|
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
"psrlw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movq %%xmm1,(%1,%2) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x10,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_yuy2), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_y), // %2
|
|
"+r"(pix) // %3
|
|
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
|
|
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
|
|
uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_yuy2), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
|
|
int stride_yuy2,
|
|
uint8* dst_u, uint8* dst_y,
|
|
int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu (%0,%4,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
"psrlw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movq %%xmm1,(%1,%2) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x10,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_yuy2), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_y), // %2
|
|
"+r"(pix) // %3
|
|
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"psrlw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqa %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_uyvy), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
|
uint8* dst_u, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqa (%0),%%xmm0 \n"
|
|
"movdqa 0x10(%0),%%xmm1 \n"
|
|
"movdqa (%0,%4,1),%%xmm2 \n"
|
|
"movdqa 0x10(%0,%4,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movq %%xmm1,(%1,%2) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x10,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_uyvy), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_y), // %2
|
|
"+r"(pix) // %3
|
|
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
|
|
uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"psrlw $0x8,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"sub $0x10,%2 \n"
|
|
"movdqu %%xmm0,(%1) \n"
|
|
"lea 0x10(%1),%1 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_uyvy), // %0
|
|
"+r"(dst_y), // %1
|
|
"+r"(pix) // %2
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1"
|
|
#endif
|
|
);
|
|
}
|
|
|
|
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
|
uint8* dst_u, uint8* dst_y, int pix) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psrlw $0x8,%%xmm5 \n"
|
|
"sub %1,%2 \n"
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm0 \n"
|
|
"movdqu 0x10(%0),%%xmm1 \n"
|
|
"movdqu (%0,%4,1),%%xmm2 \n"
|
|
"movdqu 0x10(%0,%4,1),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"pavgb %%xmm2,%%xmm0 \n"
|
|
"pavgb %%xmm3,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm0 \n"
|
|
"movdqa %%xmm0,%%xmm1 \n"
|
|
"pand %%xmm5,%%xmm0 \n"
|
|
"packuswb %%xmm0,%%xmm0 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"packuswb %%xmm1,%%xmm1 \n"
|
|
"movq %%xmm0,(%1) \n"
|
|
"movq %%xmm1,(%1,%2) \n"
|
|
"lea 0x8(%1),%1 \n"
|
|
"sub $0x10,%3 \n"
|
|
"jg 1b \n"
|
|
: "+r"(src_uyvy), // %0
|
|
"+r"(dst_u), // %1
|
|
"+r"(dst_y), // %2
|
|
"+r"(pix) // %3
|
|
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
|
|
#endif
|
|
);
|
|
}
|
|
#endif // HAS_YUY2TOYROW_SSE2
|
|
|
|
#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
// src_argb0 unaligned.
// src_argb1 and dst_argb aligned to 16 bytes.
// width must be multiple of 4 pixels.
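// Read as scalar math, each channel below is blended as
//   dst = min(255, src_argb0 + ((src_argb1 * (256 - alpha(src_argb0))) >> 8))
// and the 0xff000000 mask in xmm4 forces the destination alpha to 255.
// (This reading is for exposition; the assembly is authoritative.)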
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
|
|
uint8* dst_argb, int width) {
|
|
asm volatile (
|
|
"pcmpeqb %%xmm7,%%xmm7 \n"
|
|
"psrlw $0xf,%%xmm7 \n"
|
|
"pcmpeqb %%xmm6,%%xmm6 \n"
|
|
"psrlw $0x8,%%xmm6 \n"
|
|
"pcmpeqb %%xmm5,%%xmm5 \n"
|
|
"psllw $0x8,%%xmm5 \n"
|
|
"pcmpeqb %%xmm4,%%xmm4 \n"
|
|
"pslld $0x18,%%xmm4 \n"
|
|
|
|
// 8 pixel loop
|
|
".p2align 4 \n"
|
|
"1: \n"
|
|
"movdqu (%0),%%xmm3 \n"
|
|
"movdqa %%xmm3,%%xmm0 \n"
|
|
"pxor %%xmm4,%%xmm3 \n"
|
|
"movdqu (%1),%%xmm2 \n"
|
|
"psrlw $0x8,%%xmm3 \n"
|
|
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
|
|
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
|
|
"pand %%xmm6,%%xmm2 \n"
|
|
"paddw %%xmm7,%%xmm3 \n"
|
|
"pmullw %%xmm3,%%xmm2 \n"
|
|
"movdqu (%1),%%xmm1 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"por %%xmm4,%%xmm0 \n"
|
|
"pmullw %%xmm3,%%xmm1 \n"
|
|
"movdqu 0x10(%0),%%xmm3 \n"
|
|
"lea 0x20(%0),%0 \n"
|
|
"psrlw $0x8,%%xmm2 \n"
|
|
"paddusb %%xmm2,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"paddusb %%xmm1,%%xmm0 \n"
|
|
"sub $0x4,%3 \n"
|
|
"movdqa %%xmm0,(%2) \n"
|
|
"jle 9f \n"
|
|
"movdqa %%xmm3,%%xmm0 \n"
|
|
"pxor %%xmm4,%%xmm3 \n"
|
|
"movdqu 0x10(%1),%%xmm2 \n"
|
|
"psrlw $0x8,%%xmm3 \n"
|
|
"pshufhw $0xf5,%%xmm3,%%xmm3 \n"
|
|
"pshuflw $0xf5,%%xmm3,%%xmm3 \n"
|
|
"pand %%xmm6,%%xmm2 \n"
|
|
"paddw %%xmm7,%%xmm3 \n"
|
|
"pmullw %%xmm3,%%xmm2 \n"
|
|
"movdqu 0x10(%1),%%xmm1 \n"
|
|
"lea 0x20(%1),%1 \n"
|
|
"psrlw $0x8,%%xmm1 \n"
|
|
"por %%xmm4,%%xmm0 \n"
|
|
"pmullw %%xmm3,%%xmm1 \n"
|
|
"psrlw $0x8,%%xmm2 \n"
|
|
"paddusb %%xmm2,%%xmm0 \n"
|
|
"pand %%xmm5,%%xmm1 \n"
|
|
"paddusb %%xmm1,%%xmm0 \n"
|
|
"sub $0x4,%3 \n"
|
|
"movdqa %%xmm0,0x10(%2) \n"
|
|
"lea 0x20(%2),%2 \n"
|
|
"jg 1b \n"
|
|
"9: \n"
|
|
: "+r"(src_argb0), // %0
|
|
"+r"(src_argb1), // %1
|
|
"+r"(dst_argb), // %2
|
|
"+r"(width) // %3
|
|
:
|
|
: "memory", "cc"
|
|
#if defined(__SSE2__)
|
|
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
|
|
#endif
|
|
);
|
|
}
|
|
#endif // HAS_ARGBBLENDROW_SSE2
|
|
|
|
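// Scalar model of the blend math above (an illustrative sketch, not the
// project's C fallback). The asm builds 256 - alpha with pxor/paddw, scales
// the background by it with pmullw/psrlw, then adds the foreground with
// unsigned saturation (paddusb) and forces destination alpha opaque. This is
// the premultiplied "over" operator, with >> 8 standing in for / 255.
static void ARGBBlendPixelRef(const uint8* fg, const uint8* bg, uint8* dst) {
  const int na = 256 - fg[3];  // 256 - alpha, so >> 8 divides exactly
  for (int c = 0; c < 3; ++c) {
    const int v = fg[c] + ((bg[c] * na) >> 8);
    dst[c] = v > 255 ? 255 : static_cast<uint8>(v);  // paddusb saturates
  }
  dst[3] = 255;  // por with the 0xFF000000 mask makes the result opaque
}
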
#ifdef HAS_ARGBBLENDROW1_SSE2
// Blend 1 pixel at a time, unaligned.
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 1 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),   // %0
    "+r"(src_argb1),   // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW1_SSE2

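#if defined(HAS_ARGBBLENDROW_SSE2) && defined(HAS_ARGBBLENDROW1_SSE2)
// How the two SSE2 kernels above could be combined (a hypothetical wrapper,
// shown only to illustrate the intended split; it assumes dst_argb satisfies
// the 16-byte alignment the multiple-of-4 kernel requires): the 8-pixel
// kernel handles the bulk of the row and the 1-pixel kernel the remainder.
static void ARGBBlendRowDispatchSketch(const uint8* src_argb0,
                                       const uint8* src_argb1,
                                       uint8* dst_argb, int width) {
  const int body = width & ~3;  // largest multiple of 4 pixels
  if (body > 0) {
    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, body);
  }
  if (width > body) {  // 1 to 3 leftover pixels
    ARGBBlendRow1_SSE2(src_argb0 + body * 4, src_argb1 + body * 4,
                       dst_argb + body * 4, width - body);
  }
}
#endif  // HAS_ARGBBLENDROW_SSE2 && HAS_ARGBBLENDROW1_SSE2
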
#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table that duplicates each pixel's alpha byte into both words of
// that pixel, zero-extended (the 0x80 entries produce zero bytes).
CONST uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

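// What the mask above does, modeled in scalar code (illustrative sketch):
// pshufb copies in[mask & 15] to each output byte, or writes zero when the
// mask byte has its high bit set (the 0x80 entries). kShuffleAlpha therefore
// expands each pixel's alpha byte into a zero-extended 16-bit lane, replacing
// the psrlw/pshufhw/pshuflw sequence of the SSE2 version.
static void PshufbModel(const uint8 in[16], const uint8 mask[16],
                        uint8 out[16]) {
  for (int i = 0; i < 16; ++i) {
    out[i] = (mask[i] & 0x80) ? 0 : in[mask[i] & 15];
  }
}
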
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 8 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqu    (%0),%%xmm3                     \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "pshufb    %4,%%xmm3                       \n"
    "movdqu    (%1),%%xmm2                     \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    (%1),%%xmm1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "movdqu    0x10(%0),%%xmm3                 \n"
    "lea       0x20(%0),%0                     \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,(%2)                     \n"
    "jle       9f                              \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    0x10(%1),%%xmm2                 \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    0x10(%1),%%xmm1                 \n"
    "lea       0x20(%1),%1                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0,0x10(%2)                 \n"
    "lea       0x20(%2),%2                     \n"
    "jg        1b                              \n"
  "9:                                          \n"
  : "+r"(src_argb0),   // %0
    "+r"(src_argb1),   // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  : "m"(kShuffleAlpha) // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW_SSSE3

#ifdef HAS_ARGBBLENDROW1_SSSE3
// Blend 1 pixel at a time, unaligned.
void ARGBBlendRow1_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 1 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movd      (%0),%%xmm3                     \n"
    "lea       0x4(%0),%0                      \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      (%1),%%xmm2                     \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      (%1),%%xmm1                     \n"
    "lea       0x4(%1),%1                      \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0,(%2)                     \n"
    "lea       0x4(%2),%2                      \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),   // %0
    "+r"(src_argb1),   // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  : "m"(kShuffleAlpha) // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
#endif  // HAS_ARGBBLENDROW1_SSSE3

#ifdef HAS_ARGBATTENUATE_SSE2
// Attenuate 4 pixels at a time.
// src_argb and dst_argb must be 16-byte aligned.
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSE2

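// Scalar sketch of attenuation (illustrative; rounding differs slightly from
// the SIMD rows): each color channel is premultiplied by its pixel's alpha.
// The asm approximates c * a / 255 as ((c * 257) * (a * 257)) >> 24, using
// punpcklbw to byte-duplicate values and pmulhuw for the high multiply.
static void ARGBAttenuatePixelRef(const uint8* src, uint8* dst) {
  const int a = src[3];
  dst[0] = static_cast<uint8>(src[0] * a / 255);  // B
  dst[1] = static_cast<uint8>(src[1] * a / 255);  // G
  dst[2] = static_cast<uint8>(src[2] * a / 255);  // R
  dst[3] = static_cast<uint8>(a);  // alpha passes through unchanged
}
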
#ifdef HAS_ARGBATTENUATE_SSSE3
// Shuffle tables duplicating alpha.
CONST uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
CONST uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};

// Attenuate 4 pixels at a time.
// src_argb and dst_argb must be 16-byte aligned.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBATTENUATE_SSSE3

#ifdef HAS_ARGBUNATTENUATE_SSE2
// Unattenuate 4 pixels at a time.
// src_argb and dst_argb must be 16-byte aligned.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;
  asm volatile (
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    // 4 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movzb     0x3(%0),%3                      \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0x7(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqa    (%0),%%xmm1                     \n"
    "movzb     0xb(%0),%3                      \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "movd      0x0(%4,%3,4),%%xmm2             \n"
    "movzb     0xf(%0),%3                      \n"
    "movd      0x0(%4,%3,4),%%xmm3             \n"
    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0,(%0,%1,1)                \n"
    "lea       0x10(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
#endif  // HAS_ARGBUNATTENUATE_SSE2

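// Scalar sketch of unattenuation (illustrative): divide each channel by
// alpha, saturating at 255. The SIMD row avoids a per-pixel divide by
// indexing fixed_invtbl8 with the alpha byte; that table is assumed here to
// hold fixed-point reciprocals of alpha (its exact format lives in the C row
// code, not in this file). The a == 0 behavior below is an arbitrary choice.
static void ARGBUnattenuatePixelRef(const uint8* src, uint8* dst) {
  const int a = src[3];
  for (int c = 0; c < 3; ++c) {
    const int v = a ? (src[c] * 255) / a : 0;
    dst[c] = v > 255 ? 255 : static_cast<uint8>(v);
  }
  dst[3] = static_cast<uint8>(a);  // alpha is carried through
}
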
#ifdef HAS_ARGBGRAYROW_SSSE3
// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R.
CONST vec8 kARGBToGray = {
  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
};

// Convert 8 ARGB pixels (32 bytes) to 8 gray ARGB pixels.
void ARGBGrayRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm4                       \n"

    // 8 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm2                     \n"
    "movdqa    0x10(%0),%%xmm3                 \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+r"(width)        // %1
  : "m"(kARGBToGray)   // %2
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
#endif  // HAS_ARGBGRAYROW_SSSE3

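// Scalar sketch of the gray conversion (illustrative): a 7-bit fixed-point
// dot product with the kARGBToGray weights, 14/128 B + 76/128 G + 38/128 R,
// written back to B, G and R while alpha is preserved, matching the
// pmaddubsw/phaddw/psrlw sequence above. Like the SSSE3 row, it converts in
// place.
static void ARGBGrayPixelRef(uint8* dst_argb) {
  const int y = (dst_argb[0] * 14 + dst_argb[1] * 76 + dst_argb[2] * 38) >> 7;
  dst_argb[0] = dst_argb[1] = dst_argb[2] = static_cast<uint8>(y);
  // dst_argb[3] (alpha) is left as-is, as in the SSSE3 row.
}
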
#ifdef HAS_ARGBSEPIAROW_SSSE3
// Constants for ARGB color to sepia tone:
//   b = (r * 35 + g * 68 + b * 17) >> 7
//   g = (r * 45 + g * 88 + b * 22) >> 7
//   r = (r * 50 + g * 98 + b * 24) >> 7
CONST vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

CONST vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

CONST vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};

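// Scalar sketch of the sepia mapping these constants encode (illustrative):
// each output channel is a weighted sum of the old B, G and R, shifted right
// by 7 and saturated the way packuswb saturates; alpha is untouched. Like the
// SSSE3 row below, it converts in place.
static void ARGBSepiaPixelRef(uint8* dst_argb) {
  const int b = dst_argb[0];
  const int g = dst_argb[1];
  const int r = dst_argb[2];
  const int nb = (r * 35 + g * 68 + b * 17) >> 7;
  const int ng = (r * 45 + g * 88 + b * 22) >> 7;
  const int nr = (r * 50 + g * 98 + b * 24) >> 7;
  dst_argb[0] = nb > 255 ? 255 : static_cast<uint8>(nb);
  dst_argb[1] = ng > 255 ? 255 : static_cast<uint8>(ng);
  dst_argb[2] = nr > 255 ? 255 : static_cast<uint8>(nr);
}
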
// Convert 8 ARGB pixels (32 bytes) to 8 sepia ARGB pixels.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop
    ".p2align  4                               \n"
  "1:                                          \n"
    "movdqa    (%0),%%xmm0                     \n"
    "movdqa    0x10(%0),%%xmm6                 \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    (%0),%%xmm5                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "movdqa    (%0),%%xmm6                     \n"
    "movdqa    0x10(%0),%%xmm1                 \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0,(%0)                     \n"
    "movdqa    %%xmm1,0x10(%0)                 \n"
    "lea       0x20(%0),%0                     \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),    // %0
    "+r"(width)        // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
#endif  // HAS_ARGBSEPIAROW_SSSE3

#endif  // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus
}  // extern "C"
}  // namespace libyuv
#endif