Mirror of https://chromium.googlesource.com/libyuv/libyuv
Synced 2025-12-06 08:46:47 +08:00

Remove old alpha blend, expose GetARGBBlend, fix ComputeSumSquareErrorPlane on SSE2

BUG=29
TEST=none
Review URL: https://webrtc-codereview.appspot.com/469005
git-svn-id: http://libyuv.googlecode.com/svn/trunk@234 16f28f9a-4ce2-e073-06de-1de4eb20be90

parent c757f308ea
commit d2f4413d29
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 233
+Version: 234
 License: BSD
 License File: LICENSE
@@ -133,24 +133,19 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
-// Alpha Blend ARGB row of pixels.
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width);
+typedef void (*ARGBBlendRow)(const uint8* src_argb0,
+                             const uint8* src_argb1,
+                             uint8* dst_argb, int width);
 
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width);
+// Get function to Alpha Blend ARGB pixels and store to destination.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width);
 
-// Alpha Blend ARGB.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
+// Alpha Blend ARGB images and store to destination.
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
+              const uint8* src_argb1, int src_stride_argb1,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height);
 
-// Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
-               const uint8* src_argb1, int src_stride_argb1,
-               uint8* dst_argb, int dst_stride_argb,
-               int width, int height);
-
 // Convert I422 to YUY2.
 int I422ToYUY2(const uint8* src_y, int src_stride_y,
                const uint8* src_u, int src_stride_u,
@@ -20,7 +20,7 @@ extern "C" {
 
 // Supported rotation
 enum RotationMode {
-  kRotate0 = 0, // No rotation
+  kRotate0 = 0,  // No rotation
   kRotate90 = 90,  // Rotate 90 degrees clockwise
   kRotate180 = 180,  // Rotate 180 degrees
   kRotate270 = 270,  // Rotate 270 degrees clockwise
@@ -11,7 +11,7 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define INCLUDE_LIBYUV_VERSION 233
+#define LIBYUV_VERSION 234
 
 #endif // INCLUDE_LIBYUV_VERSION_H_
@@ -25,18 +25,37 @@ namespace libyuv {
 extern "C" {
 #endif
 
-// hash seed of 5381 recommended.
-uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+// Internal C version of HashDjb2 with int sized count for efficiency.
+static uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
   uint32 hash = seed;
-  if (count > 0) {
-    do {
-      hash = hash * 33 + *src++;
-    } while (--count);
+  for (int i = 0; i < count; ++i) {
+    hash += (hash << 5) + src[i];
   }
   return hash;
 }
 
+// hash seed of 5381 recommended.
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+  const int kBlockSize = 1 << 15;  // 32768;
+  while (count >= static_cast<uint64>(kBlockSize)) {
+    seed = HashDjb2_C(src, kBlockSize, seed);
+    src += kBlockSize;
+    count -= kBlockSize;
+  }
+  int remainder = static_cast<int>(count) & ~15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+    src += remainder;
+    count -= remainder;
+  }
+  remainder = static_cast<int>(count) & 15;
+  if (remainder) {
+    seed = HashDjb2_C(src, remainder, seed);
+  }
+  return seed;
+}
+
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SUMSQUAREERROR_NEON
 
 static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
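A note on the rewrite above: the new loop body is the same djb2 recurrence as the old do/while, since hash += (hash << 5) + src[i] equals hash * 33 + src[i]. Below is a minimal standalone sketch of the block-splitting technique the new HashDjb2 wrapper uses to keep the hot loop's counter in a plain int while accepting a 64-bit count; the names here are hypothetical, not libyuv's:

    #include <stdint.h>

    // Same recurrence as HashDjb2_C: hash = hash * 33 + byte.
    static uint32_t Djb2C(const uint8_t* src, int count, uint32_t seed) {
      uint32_t hash = seed;
      for (int i = 0; i < count; ++i) {
        hash += (hash << 5) + src[i];
      }
      return hash;
    }

    // Walk a possibly 64-bit count in 32 KB blocks so the inner loop
    // indexes with a plain int.
    uint32_t Djb2Blocked(const uint8_t* src, uint64_t count, uint32_t seed) {
      const int kBlockSize = 1 << 15;  // 32768 bytes per block.
      while (count >= (uint64_t)kBlockSize) {
        seed = Djb2C(src, kBlockSize, seed);
        src += kBlockSize;
        count -= kBlockSize;
      }
      if (count) {
        seed = Djb2C(src, (int)count, seed);
      }
      return seed;
    }

libyuv's own wrapper additionally splits the tail into a multiple-of-16 chunk plus a remainder, so an aligned SIMD kernel can later be slotted in for the large pieces.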
@@ -75,9 +94,9 @@ static uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b,
   return sse;
 }
 
-#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SUMSQUAREERROR_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
                                   int count) {
   __asm {
@@ -94,7 +113,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
     movdqa xmm2, [eax + edx]
     lea eax, [eax + 16]
     sub ecx, 16
-    movdqa xmm3, xmm1
+    movdqa xmm3, xmm1  // abs trick
    psubusb xmm1, xmm2
    psubusb xmm2, xmm3
    por xmm1, xmm2
@@ -116,7 +135,7 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SUMSQUAREERROR_SSE2
 static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
                                   int count) {
@@ -167,11 +186,9 @@ static uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b,
 static uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b,
                                int count) {
   uint32 sse = 0u;
-  for (int x = 0; x < count; ++x) {
-    int diff = src_a[0] - src_b[0];
+  for (int i = 0; i < count; ++i) {
+    int diff = src_a[i] - src_b[i];
     sse += static_cast<uint32>(diff * diff);
-    src_a += 1;
-    src_b += 1;
   }
   return sse;
 }
@@ -187,6 +204,7 @@ uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
 #elif defined(HAS_SUMSQUAREERROR_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
       IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+    // Note only used for multiples of 16 so count is not checked.
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
@@ -225,8 +243,9 @@ uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
     SumSquareError = SumSquareError_NEON;
   }
 #elif defined(HAS_SUMSQUAREERROR_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+  if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+      IS_ALIGNED(src_a, 16) && IS_ALIGNED(stride_a, 16) &&
+      IS_ALIGNED(src_b, 16) && IS_ALIGNED(stride_b, 16)) {
     SumSquareError = SumSquareError_SSE2;
   }
 #endif
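The stricter condition in the last hunk is the ComputeSumSquareErrorPlane fix from the commit title: the aligned SSE2 kernel is now chosen only when the width is a multiple of 16 and the row strides, not just the base pointers, are 16-byte aligned. A hedged usage sketch of the public entry point (PlaneMse is a hypothetical helper; the ComputeSumSquareErrorPlane signature is taken from the context lines above):

    #include "libyuv/basic_types.h"
    #include "libyuv/compare.h"

    // Mean squared error between two equally sized planes.
    double PlaneMse(const uint8* a, int stride_a,
                    const uint8* b, int stride_b,
                    int width, int height) {
      uint64 sse = libyuv::ComputeSumSquareErrorPlane(a, stride_a,
                                                      b, stride_b,
                                                      width, height);
      return static_cast<double>(sse) /
             (static_cast<double>(width) * static_cast<double>(height));
    }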
@@ -61,9 +61,9 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   return 0;
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_HALFROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
   __asm {
@@ -86,7 +86,7 @@ static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_HALFROW_SSE2
 static void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                          uint8* dst_uv, int pix) {
@@ -179,12 +179,13 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 
 // Blends 32x2 pixels to 16x1
 // source in scale.cc
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SCALEROWDOWN2_NEON
 void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                            uint8* dst, int dst_width);
-#elif defined(_M_IX86) || defined(__x86_64__) || defined(__i386__) && \
-    !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 
 void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                            uint8* dst_ptr, int dst_width);
 #endif
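For reference, "blends 32x2 pixels to 16x1" describes a 2x2 box filter: each output byte is the rounded average of two horizontally adjacent bytes taken from two adjacent source rows. A plain C sketch of that contract (a hypothetical standalone version, not the scale.cc implementation itself):

    #include <stdint.h>

    static void ScaleRowDown2IntC(const uint8_t* src_ptr, int src_stride,
                                  uint8_t* dst, int dst_width) {
      const uint8_t* s = src_ptr;               // top row
      const uint8_t* t = src_ptr + src_stride;  // bottom row
      for (int x = 0; x < dst_width; ++x) {
        // Rounded average of a 2x2 block.
        dst[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
        s += 2;
        t += 2;
      }
    }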
@@ -450,9 +451,9 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
                     width, height);
 }
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SPLITYUY2_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SplitYUY2_SSE2(const uint8* src_yuy2,
                            uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
@@ -498,7 +499,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SPLITYUY2_SSE2
 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                            uint8* dst_u, uint8* dst_v, int pix) {
@@ -205,9 +205,9 @@ int I400Copy(const uint8* src_y, int src_stride_y,
 // UYVY - Macro-pixel = 2 image pixels
 // U0Y0V0Y1
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_I42XTOYUY2ROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
@@ -246,7 +246,7 @@ static void I42xToYUY2Row_SSE2(const uint8* src_y,
 }
 
 #define HAS_I42XTOUYVYROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void I42xToUYVYRow_SSE2(const uint8* src_y,
                                const uint8* src_u,
                                const uint8* src_v,
@@ -283,7 +283,7 @@ static void I42xToUYVYRow_SSE2(const uint8* src_y,
     ret
   }
 }
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_I42XTOYUY2ROW_SSE2
 static void I42xToYUY2Row_SSE2(const uint8* src_y,
                                const uint8* src_u,
@@ -24,9 +24,9 @@ extern "C" {
 // and vst would select which 2 components to write. The low level would need
 // to be ARGBToBG, ARGBToGB, ARGBToRG, ARGBToGR
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBTOBAYERROW_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
                                  uint8* dst_bayer, uint32 selector, int pix) {
   __asm {
@@ -36,6 +36,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
     mov ecx, [esp + 16]  // pix
     pshufd xmm5, xmm5, 0
 
+    align 16
  wloop:
     movdqa xmm0, [eax]
    lea eax, [eax + 16]
@@ -48,7 +49,7 @@ static void ARGBToBayerRow_SSSE3(const uint8* src_argb,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 
 #define HAS_ARGBTOBAYERROW_SSSE3
 static void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
@@ -137,87 +137,38 @@ int ARGBCopy(const uint8* src_argb, int src_stride_argb,
   return 0;
 }
 
-// Alpha Blend ARGB
-void ARGBBlendRow(const uint8* src_argb, uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow_SSSE3(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow_SSE2(src_argb, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlendRow_C(src_argb, dst_argb, width);
-}
-
-// Alpha Blend 2 rows of ARGB pixels and store to destination.
-void ARGBBlend2Row(const uint8* src_argb0, const uint8* src_argb1,
-                   uint8* dst_argb, int width) {
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row_SSSE3(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row_SSE2(src_argb0, src_argb1, dst_argb, width);
-    return;
-  }
-#endif
-  ARGBBlend2Row_C(src_argb0, src_argb1, dst_argb, width);
-}
-
-// Alpha Blend ARGB
-// TODO(fbarchard): Call 3 pointer low levels to reduce code size.
-int ARGBBlend(const uint8* src_argb, int src_stride_argb,
-              uint8* dst_argb, int dst_stride_argb,
-              int width, int height) {
-  if (!src_argb || !dst_argb || width <= 0 || height == 0) {
-    return -1;
-  }
-  // Negative height means invert the image.
-  if (height < 0) {
-    height = -height;
-    src_argb = src_argb + (height - 1) * src_stride_argb;
-    src_stride_argb = -src_stride_argb;
-  }
-
-  void (*ARGBBlendRow)(const uint8* src_argb, uint8* dst_argb, int width) =
-      ARGBBlendRow_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlendRow = ARGBBlendRow_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlendRow = ARGBBlendRow_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
-    }
-  }
-#endif
-
-  for (int y = 0; y < height; ++y) {
-    ARGBBlendRow(src_argb, dst_argb, width);
-    src_argb += src_stride_argb;
-    dst_argb += dst_stride_argb;
-  }
-  return 0;
-}
+// Get a blender that optimized for the CPU, alignment and pixel count.
+// As there are 6 blenders to choose from, the caller should try to use
+// the same blend function for all pixels if possible.
+ARGBBlendRow GetARGBBlend(uint8* dst_argb, int dst_stride_argb, int width) {
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) = ARGBBlendRow_C;
+#if defined(HAS_ARGBBLENDROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    ARGBBlendRow = ARGBBlendRow1_SSE2;
+    if (width >= 4) {
+      ARGBBlendRow = ARGBBlendRow_Any_SSE2;
+      if (IS_ALIGNED(width, 4) &&
+          IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+        ARGBBlendRow = ARGBBlendRow_Aligned_SSE2;
+      }
+    }
+  }
+#endif
+#if defined(HAS_ARGBBLENDROW_SSSE3)
+  if (TestCpuFlag(kCpuHasSSSE3) && width >= 4) {
+    ARGBBlendRow = ARGBBlendRow_Any_SSSE3;
+    if (IS_ALIGNED(width, 4) &&
+        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+      ARGBBlendRow = ARGBBlendRow_Aligned_SSSE3;
+    }
+  }
+#endif
+  return ARGBBlendRow;
+}
 
 // Alpha Blend 2 ARGB images and store to destination.
-int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
+int ARGBBlend(const uint8* src_argb0, int src_stride_argb0,
               const uint8* src_argb1, int src_stride_argb1,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
@@ -230,30 +181,12 @@ int ARGB2Blend(const uint8* src_argb0, int src_stride_argb0,
     dst_argb = dst_argb + (height - 1) * dst_stride_argb;
     dst_stride_argb = -dst_stride_argb;
   }
 
-  void (*ARGBBlend2Row)(const uint8* src_argb, const uint8* src_argb1,
-                        uint8* dst_argb, int width) = ARGBBlend2Row_C;
-#if defined(HAS_ARGBBLENDROW_SSE2)
-  if (TestCpuFlag(kCpuHasSSE2)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSE2;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSE2;
-    }
-  }
-#endif
-#if defined(HAS_ARGBBLENDROW_SSSE3)
-  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBBlend2Row = ARGBBlend2Row_SSSE3;
-    if (IS_ALIGNED(width, 4) &&
-        IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
-      ARGBBlend2Row = ARGBBlend2Row_Aligned_SSSE3;
-    }
-  }
-#endif
+  void (*ARGBBlendRow)(const uint8* src_argb, const uint8* src_argb1,
+                       uint8* dst_argb, int width) =
+      GetARGBBlend(dst_argb, dst_stride_argb, width);
 
   for (int y = 0; y < height; ++y) {
-    ARGBBlend2Row(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow(src_argb0, src_argb1, dst_argb, width);
     src_argb0 += src_stride_argb0;
     src_argb1 += src_stride_argb1;
     dst_argb += dst_stride_argb;
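The comment on GetARGBBlend states the intended contract: resolve one of the six row blenders up front, then reuse it for every row, exactly as the rewritten ARGBBlend above does. A hedged sketch of that call pattern (BlendImage is a hypothetical helper, not part of libyuv):

    // Blend src0 over src1 into dst, resolving the row function once.
    void BlendImage(const uint8* src0, int stride0,
                    const uint8* src1, int stride1,
                    uint8* dst, int dst_stride, int width, int height) {
      ARGBBlendRow blend_row = GetARGBBlend(dst, dst_stride, width);
      for (int y = 0; y < height; ++y) {
        blend_row(src0, src1, dst, width);
        src0 += stride0;
        src1 += stride1;
        dst += dst_stride;
      }
    }

Fetching the blender once matters because the choice depends on the destination pointer's alignment, which is constant per image but would otherwise be re-tested per row.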
@@ -725,7 +658,7 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
 // SetRow8 writes 'count' bytes using a 32 bit value repeated
 // SetRow32 writes 'count' words using a 32 bit value repeated
 
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_SETROW_NEON
 static void SetRow8_NEON(uint8* dst, uint32 v32, int count) {
   asm volatile (
@@ -749,9 +682,9 @@ static void SetRows32_NEON(uint8* dst, uint32 v32, int width,
   }
 }
 
-#elif defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_SETROW_X86
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   __asm {
     mov edx, edi
@@ -765,7 +698,7 @@ static void SetRow8_X86(uint8* dst, uint32 v32, int count) {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void SetRows32_X86(uint8* dst, uint32 v32, int width,
                           int dst_stride, int height) {
   __asm {
@@ -793,7 +726,7 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
   }
 }
 
-#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 #define HAS_SETROW_X86
 static void SetRow8_X86(uint8* dst, uint32 v32, int width) {
   size_t width_tmp = static_cast<size_t>(width);
@@ -903,6 +836,7 @@ int I420Rect(uint8* dst_y, int dst_stride_y,
   return 0;
 }
 
+// TODO(fbarchard): Add TestCpuFlag(kCpuHasX86) to allow C code to be tested.
 // Draw a rectangle into ARGB
 int ARGBRect(uint8* dst_argb, int dst_stride_argb,
              int dst_x, int dst_y,
@@ -916,12 +850,14 @@ int ARGBRect(uint8* dst_argb, int dst_stride_argb,
   uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4;
 #if defined(HAS_SETROW_X86)
   SetRows32_X86(dst, value, width, dst_stride_argb, height);
-#elif defined(HAS_SETROW_NEON)
+#else
+#if defined(HAS_SETROW_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16) &&
       IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
     SetRows32_NEON(dst, value, width, dst_stride_argb, height);
+    return 0;
   }
+#endif
   SetRows32_C(dst, value, width, dst_stride_argb, height);
 #endif
   return 0;
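The SetRow8/SetRows32 helpers are memset-style fills; the ARGBRect path above stores one full 32-bit ARGB word per pixel. A plain C sketch of the SetRows32 contract (a hypothetical standalone version; the real code dispatches to the X86/NEON variants shown, and rows of pixel data are assumed 4-byte aligned):

    #include <stdint.h>

    static void SetRows32C(uint8_t* dst, uint32_t v32, int width,
                           int dst_stride, int height) {
      for (int y = 0; y < height; ++y) {
        uint32_t* d = (uint32_t*)(dst + y * dst_stride);
        for (int x = 0; x < width; ++x) {
          d[x] = v32;  // one ARGB word per pixel
        }
      }
    }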
@@ -21,8 +21,8 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 #if defined(__APPLE__) && defined(__i386__)
 #define DECLARE_FUNCTION(name) \
     ".text \n" \
@@ -59,9 +59,9 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
                          int width);
 #endif
 
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_TRANSPOSE_WX8_SSSE3
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
   __asm {
@@ -153,7 +153,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
 }
 
 #define HAS_TRANSPOSE_UVWX8_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                 uint8* dst_a, int dst_stride_a,
                                 uint8* dst_b, int dst_stride_b,
@@ -281,7 +281,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     ret
   }
 }
-#elif defined(__i386__) || defined(__x86_64__) && !defined(YUV_DISABLE_ASM)
+#elif !defined(YUV_DISABLE_ASM) && (defined(__i386__) || defined(__x86_64__))
 #define HAS_TRANSPOSE_WX8_SSSE3
 static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
                                uint8* dst, int dst_stride, int width) {
@@ -369,7 +369,7 @@ static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
   );
 }
 
-#if defined (__i386__)
+#if !defined(YUV_DISABLE_ASM) && defined (__i386__)
 #define HAS_TRANSPOSE_UVWX8_SSE2
 extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
                                     uint8* dst_a, int dst_stride_a,
@@ -491,7 +491,7 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
     "pop %ebx \n"
     "ret \n"
 );
-#elif defined(__x86_64__)
+#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
 // 64 bit version has enough registers to do 16x8 to 8x16 at a time.
 #define HAS_TRANSPOSE_WX8_FAST_SSSE3
 static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
 
@@ -17,7 +17,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 
 static const uvec8 vtbl_4x4_transpose =
   { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
 
source/row.h (37 changes)
@@ -18,6 +18,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
+// TODO(fbarchard): Remove kMaxStride
 #define kMaxStride (2560 * 4)
 #define IS_ALIGNED(p, a) (!((uintptr_t)(p) & ((a) - 1)))
 
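IS_ALIGNED relies on the alignment being a power of two: p is a multiple of a exactly when the low bits of p selected by (a - 1) are all zero. A self-contained check of the arithmetic (illustrative only, not part of the library):

    #include <assert.h>
    #include <stdint.h>

    int main() {
      // (a - 1) masks the remainder of p modulo a when a is a power of two.
      assert(!((uintptr_t)0x1000 & (16 - 1)));      // 0x1000: 16-byte aligned.
      assert(((uintptr_t)0x1004 & (16 - 1)) == 4);  // 0x1004: 4 bytes past.
      return 0;
    }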
@@ -26,8 +27,9 @@ extern "C" {
 #endif
 
 // The following are available on all x86 platforms
-#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
-    !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && \
+    (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
 
 #define HAS_ABGRTOARGBROW_SSSE3
 #define HAS_BGRATOARGBROW_SSSE3
 #define HAS_RGB24TOARGBROW_SSSE3
@@ -66,7 +68,7 @@ extern "C" {
 #endif
 
 // The following are available on Neon platforms
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 #define HAS_MIRRORROW_NEON
 #define HAS_MIRRORROWUV_NEON
 #define HAS_SPLITUV_NEON
@@ -78,7 +80,7 @@ extern "C" {
 
 // The following are only available on Win32
 // TODO(fbarchard): Port to GCC
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 #define HAS_ARGBBLENDROW_SSSE3
 #endif
 
@@ -265,25 +267,18 @@ void YToARGBRow_SSE2(const uint8* y_buf,
                      int width);
 
 // ARGB preattenuated alpha blend.
-void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
-                                int width);
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width);
-void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width);
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width);
-
-// ARGB preattenuated alpha blend with 2 sources and a destination.
-void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                                 uint8* dst_argb, int width);
-void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                                uint8* dst_argb, int width);
-void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
-                         uint8* dst_argb, int width);
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width);
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
-                     uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                                uint8* dst_argb, int width);
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                               uint8* dst_argb, int width);
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                        uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
+                            uint8* dst_argb, int width);
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width);
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
+                    uint8* dst_argb, int width);
 
 // 'Any' functions handle any size and alignment.
 void I420ToARGBRow_Any_SSSE3(const uint8* y_buf,
@@ -454,73 +454,10 @@ void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
 }
 
 #define BLENDER(f, b, a) (((256 - a) * b) >> 8) + f
-void ARGBBlendRow_C(const uint8* src_argb, uint8* dst_argb, int width) {
-  for (int x = 0; x < width - 1; x += 2) {
-    uint32 a = src_argb[3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[0];
-        const uint32 fg = src_argb[1];
-        const uint32 fr = src_argb[2];
-        const uint32 bb = dst_argb[0];
-        const uint32 bg = dst_argb[1];
-        const uint32 br = dst_argb[2];
-        dst_argb[0] = BLENDER(fb, bb, a);
-        dst_argb[1] = BLENDER(fg, bg, a);
-        dst_argb[2] = BLENDER(fr, br, a);
-        dst_argb[3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb) =
-            *reinterpret_cast<const uint32*>(src_argb);
-      }
-    }
-    a = src_argb[4 + 3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[4 + 0];
-        const uint32 fg = src_argb[4 + 1];
-        const uint32 fr = src_argb[4 + 2];
-        const uint32 bb = dst_argb[4 + 0];
-        const uint32 bg = dst_argb[4 + 1];
-        const uint32 br = dst_argb[4 + 2];
-        dst_argb[4 + 0] = BLENDER(fb, bb, a);
-        dst_argb[4 + 1] = BLENDER(fg, bg, a);
-        dst_argb[4 + 2] = BLENDER(fr, br, a);
-        dst_argb[4 + 3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb + 4) =
-            *reinterpret_cast<const uint32*>(src_argb + 4);
-      }
-    }
-    src_argb += 8;
-    dst_argb += 8;
-  }
-
-  if (width & 1) {
-    const uint32 a = src_argb[3];
-    if (a) {
-      if (a < 255) {
-        const uint32 fb = src_argb[0];
-        const uint32 fg = src_argb[1];
-        const uint32 fr = src_argb[2];
-        const uint32 bb = dst_argb[0];
-        const uint32 bg = dst_argb[1];
-        const uint32 br = dst_argb[2];
-        dst_argb[0] = BLENDER(fb, bb, a);
-        dst_argb[1] = BLENDER(fg, bg, a);
-        dst_argb[2] = BLENDER(fr, br, a);
-        dst_argb[3] = 255u;
-      } else {
-        *reinterpret_cast<uint32*>(dst_argb) =
-            *reinterpret_cast<const uint32*>(src_argb);
-      }
-    }
-  }
-}
 
 // Blend src_argb0 over src_argb1 and store to dst_argb.
 // dst_argb may be src_argb0 or src_argb1.
-void ARGBBlend2Row_C(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_C(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
   for (int x = 0; x < width - 1; x += 2) {
     uint32 a = src_argb0[3];
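BLENDER assumes the foreground channel f is already attenuated (premultiplied) by its alpha — the "preattenuated" wording in row.h — so the result is f plus only the (256 - a)/256 share of the background. A tiny standalone check of the arithmetic (hypothetical helper that mirrors the macro above):

    #include <stdint.h>
    #include <stdio.h>

    // dst = src + dst * (256 - alpha) / 256, per channel.
    static uint8_t Blender(uint32_t f, uint32_t b, uint32_t a) {
      return (uint8_t)((((256 - a) * b) >> 8) + f);
    }

    int main() {
      // Half-transparent premultiplied white (f = 128, a = 128) over black:
      printf("%u\n", Blender(128, 0, 128));    // prints 128
      // ...and over white (b = 255): 128 + ((128 * 255) >> 8) = 128 + 127.
      printf("%u\n", Blender(128, 255, 128));  // prints 255
      return 0;
    }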
@@ -16,7 +16,7 @@ extern "C" {
 #endif
 
 // This module is for GCC Neon
-#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
 
 #define YUVTORGB \
     "vld1.u8 {d0}, [%0]! \n" \
 
@@ -18,7 +18,7 @@ extern "C" {
 #endif
 
 // This module is for GCC x86 and x64
-#if (defined(__x86_64__) || defined(__i386__)) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
 
 // GCC 4.2 on OSX has link error when passing static or const to inline.
 // TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
@@ -2031,162 +2031,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
-#ifdef HAS_ARGBBLENDROW_SSE2
-// Blend 8 pixels at a time
-// Destination aligned to 16 bytes, multiple of 4 pixels
-void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
-                               int width) {
-  asm volatile (
-    "pcmpeqb %%xmm7,%%xmm7 \n"
-    "psrlw $0xf,%%xmm7 \n"
-    "pcmpeqb %%xmm6,%%xmm6 \n"
-    "psrlw $0x8,%%xmm6 \n"
-    "pcmpeqb %%xmm5,%%xmm5 \n"
-    "psllw $0x8,%%xmm5 \n"
-    "pcmpeqb %%xmm4,%%xmm4 \n"
-    "pslld $0x18,%%xmm4 \n"
-
-  // 8 pixel loop
-  "1: \n"
-    "movdqu (%0),%%xmm3 \n"  // first 4 pixels
-    "movdqa %%xmm3,%%xmm0 \n"
-    "pxor %%xmm4,%%xmm3 \n"
-    "movdqa (%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movdqa (%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "movdqu 0x10(%0),%%xmm3 \n"
-    "lea 0x20(%0),%0 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
-    "movdqa %%xmm0,(%1) \n"
-    "jle 9f \n"
-    "movdqa %%xmm3,%%xmm0 \n"  // next 4 pixels
-    "pxor %%xmm4,%%xmm3 \n"
-    "movdqa 0x10(%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movdqa 0x10(%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x4,%2 \n"
-    "movdqa %%xmm0,0x10(%1) \n"
-    "lea 0x20(%1),%1 \n"
-    "jg 1b \n"
-  "9: \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-// Blend 1 pixel at a time, unaligned
-void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  asm volatile (
-    "pcmpeqb %%xmm7,%%xmm7 \n"
-    "psrlw $0xf,%%xmm7 \n"
-    "pcmpeqb %%xmm6,%%xmm6 \n"
-    "psrlw $0x8,%%xmm6 \n"
-    "pcmpeqb %%xmm5,%%xmm5 \n"
-    "psllw $0x8,%%xmm5 \n"
-    "pcmpeqb %%xmm4,%%xmm4 \n"
-    "pslld $0x18,%%xmm4 \n"
-
-  // 1 pixel loop
-  "1: \n"
-    "movd (%0),%%xmm3 \n"
-    "lea 0x4(%0),%0 \n"
-    "movdqa %%xmm3,%%xmm0 \n"
-    "pxor %%xmm4,%%xmm3 \n"
-    "movd (%1),%%xmm2 \n"
-    "psrlw $0x8,%%xmm3 \n"
-    "pshufhw $0xf5,%%xmm3,%%xmm3 \n"
-    "pshuflw $0xf5,%%xmm3,%%xmm3 \n"
-    "pand %%xmm6,%%xmm2 \n"
-    "paddw %%xmm7,%%xmm3 \n"
-    "pmullw %%xmm3,%%xmm2 \n"
-    "movd (%1),%%xmm1 \n"
-    "psrlw $0x8,%%xmm1 \n"
-    "por %%xmm4,%%xmm0 \n"
-    "pmullw %%xmm3,%%xmm1 \n"
-    "psrlw $0x8,%%xmm2 \n"
-    "paddusb %%xmm2,%%xmm0 \n"
-    "pand %%xmm5,%%xmm1 \n"
-    "paddusb %%xmm1,%%xmm0 \n"
-    "sub $0x1,%2 \n"
-    "movd %%xmm0,(%1) \n"
-    "lea 0x4(%1),%1 \n"
-    "jg 1b \n"
-  : "+r"(src_argb),  // %0
-    "+r"(dst_argb),  // %1
-    "+r"(width)      // %2
-  :
-  : "memory", "cc"
-#if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
-#endif
-  );
-}
-
-void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
-  // Do 1 to 3 pixels to get destination aligned.
-  if ((uintptr_t)(dst_argb) & 15) {
-    int count = width;
-    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
-      count = (-(intptr_t)(dst_argb) >> 2) & 3;
-    }
-    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
-    src_argb += count * 4;
-    dst_argb += count * 4;
-    width -= count;
-  }
-  // Do multiple of 4 pixels
-  if (width & ~3) {
-    ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
-  }
-  // Do remaining 1 to 3 pixels
-  if (width & 3) {
-    src_argb += (width & ~3) * 4;
-    dst_argb += (width & ~3) * 4;
-    width &= 3;
-    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
-  }
-}
-#endif  // HAS_ARGBBLENDROW_SSE2
-
 #ifdef HAS_ARGBBLENDROW_SSE2
 // Blend 8 pixels at a time
 // Destination aligned to 16 bytes, multiple of 4 pixels
-void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb %%xmm7,%%xmm7 \n"
@@ -2259,7 +2104,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   }
 
 // Blend 1 pixel at a time, unaligned
-void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                         uint8* dst_argb, int width) {
   asm volatile (
     "pcmpeqb %%xmm7,%%xmm7 \n"
@@ -2309,15 +2154,15 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   );
 }
 
-void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
-                        uint8* dst_argb, int width) {
+void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
+                           uint8* dst_argb, int width) {
   // Do 1 to 3 pixels to get destination aligned.
   if ((uintptr_t)(dst_argb) & 15) {
     int count = width;
     if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
       count = (-(intptr_t)(dst_argb) >> 2) & 3;
     }
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
     src_argb0 += count * 4;
     src_argb1 += count * 4;
     dst_argb += count * 4;
@@ -2325,7 +2170,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
   }
   // Do multiple of 4 pixels
   if (width & ~3) {
-    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
+    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
   }
   // Do remaining 1 to 3 pixels
   if (width & 3) {
@@ -2333,19 +2178,11 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
     src_argb1 += (width & ~3) * 4;
     dst_argb += (width & ~3) * 4;
     width &= 3;
-    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
+    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
   }
 }
 #endif  // HAS_ARGBBLENDROW_SSE2
-
-
-
 
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
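The _Any_ wrappers above split a row into an unaligned head, an aligned multiple-of-4-pixel body, and a 1-3 pixel tail. The head length comes from the (-(intptr_t)dst >> 2) & 3 expression: with 4-byte ARGB pixels it counts how many single pixels to blend before dst reaches a 16-byte boundary. A standalone sketch of that arithmetic (hypothetical name; assumes the usual arithmetic right shift of negative values, as the library's own code does):

    #include <stdint.h>

    // Pixels to process one at a time before a 4-byte-per-pixel destination
    // pointer becomes 16-byte aligned.
    static int PixelsUntilAligned16(const uint8_t* dst) {
      return (int)((-(intptr_t)dst >> 2) & 3);
    }
    // dst % 16 == 0  -> 0 pixels of head.
    // dst % 16 == 4  -> 3 pixels (12 bytes) of head.
    // dst % 16 == 12 -> 1 pixel (4 bytes) of head.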
@@ -16,7 +16,7 @@ extern "C" {
 #endif
 
 // This module is for Visual C x86
-#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
+#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
 
 #ifdef HAS_ARGBTOYROW_SSSE3
 
@@ -99,7 +99,7 @@ static const uvec8 kShuffleMaskARGBToRAW = {
   2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_y
@@ -127,7 +127,7 @@ void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_abgr
@@ -148,7 +148,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_bgra
@@ -169,7 +169,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_rgb24
@@ -208,7 +208,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                         int pix) {
   __asm {
@@ -255,7 +255,7 @@ __asm {
 // v * (256 + 8)
 // G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
 // 20 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
                           int pix) {
   __asm {
@@ -306,7 +306,7 @@ __asm {
 }
 
 // 24 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -360,7 +360,7 @@ __asm {
 }
 
 // 18 instructions
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
                             int pix) {
   __asm {
@@ -399,7 +399,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -438,7 +438,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -477,7 +477,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -516,7 +516,7 @@ __asm {
 }
 
 // TODO(fbarchard): Improve sign extension/packing
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -558,7 +558,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
   __asm {
     mov eax, [esp + 4]  // src_argb
@@ -589,7 +589,7 @@ __asm {
 }
 
 // Convert 16 ARGB pixels (64 bytes) to 16 Y values
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -623,7 +623,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -657,7 +657,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -691,7 +691,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -725,7 +725,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -759,7 +759,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
   __asm {
     mov eax, [esp + 4]  /* src_argb */
@@ -793,7 +793,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -859,7 +859,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -929,7 +929,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -995,7 +995,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1065,7 +1065,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1131,7 +1131,7 @@ __asm {
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
   __asm {
@@ -1268,7 +1268,7 @@ static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
     __asm packuswb xmm2, xmm2 /* R */ \
   }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1308,7 +1308,7 @@ void I420ToARGBRow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToBGRARow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1348,7 +1348,7 @@ void I420ToBGRARow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToABGRRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1388,7 +1388,7 @@ void I420ToABGRRow_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1428,7 +1428,7 @@ void I420ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1468,7 +1468,7 @@ void I420ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
@@ -1508,7 +1508,7 @@ void I420ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void I444ToARGBRow_SSSE3(const uint8* y_buf,
                          const uint8* u_buf,
                          const uint8* v_buf,
@@ -1575,7 +1575,7 @@ void I444ToARGBRow_SSSE3(const uint8* y_buf,
 #endif
 
 #ifdef HAS_YTOARGBROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YToARGBRow_SSE2(const uint8* y_buf,
                      uint8* rgb_buf,
                      int width) {
@@ -1628,7 +1628,7 @@ static const uvec8 kShuffleMirror = {
   15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1653,7 +1653,7 @@ __asm {
 #ifdef HAS_MIRRORROW_SSE2
 // SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
 // version can not.
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1686,7 +1686,7 @@ static const uvec8 kShuffleMirrorUV = {
   14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
 };
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width) {
   __asm {
@@ -1717,7 +1717,7 @@ void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
 #endif
 
 #ifdef HAS_SPLITUV_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
   __asm {
     push edi
@@ -1756,7 +1756,7 @@ void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
 
 #ifdef HAS_COPYROW_SSE2
 // CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
   __asm {
     mov eax, [esp + 4]  // src
@@ -1779,7 +1779,7 @@ void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
 #endif // HAS_COPYROW_SSE2
 
 #ifdef HAS_COPYROW_X86
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void CopyRow_X86(const uint8* src, uint8* dst, int count) {
   __asm {
     mov eax, esi
@@ -1797,7 +1797,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int count) {
 #endif
 
 #ifdef HAS_YUY2TOYROW_SSE2
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToYRow_SSE2(const uint8* src_yuy2,
                      uint8* dst_y, int pix) {
   __asm {
@@ -1823,7 +1823,7 @@ void YUY2ToYRow_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -1867,7 +1867,7 @@ void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                                uint8* dst_y, int pix) {
   __asm {
@@ -1893,7 +1893,7 @@ void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                 uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -1937,7 +1937,7 @@ void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToYRow_SSE2(const uint8* src_uyvy,
                      uint8* dst_y, int pix) {
   __asm {
@@ -1961,7 +1961,7 @@ void UYVYToYRow_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@@ -2005,7 +2005,7 @@ void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                                uint8* dst_y, int pix) {
   __asm {
@@ -2029,7 +2029,7 @@ void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
   }
 }
 
-__declspec(naked)
+__declspec(naked) __declspec(align(16))
 void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                 uint8* dst_u, uint8* dst_y, int pix) {
   __asm {
@ -2078,273 +2078,7 @@ void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
|
||||
// Blend 8 pixels at a time
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb, uint8* dst_argb,
|
||||
int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||
psrlw xmm7, 15
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||
psrlw xmm6, 8
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||
psllw xmm5, 8
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqu xmm3, [eax]
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
movdqu xmm3, [eax + 16]
|
||||
lea eax, [eax + 32]
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx], xmm0
|
||||
jle done
|
||||
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movdqa xmm2, [edx + 16] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movdqa xmm1, [edx + 16] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 4
|
||||
movdqa [edx + 16], xmm0
|
||||
lea edx, [edx + 32]
|
||||
jg convertloop
|
||||
|
||||
done:
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
// Blend 1 pixel at a time, unaligned
|
||||
__declspec(naked) __declspec(align(16))
|
||||
void ARGBBlendRow1_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // width
|
||||
pcmpeqb xmm7, xmm7 // generate constant 1
|
||||
psrlw xmm7, 15
|
||||
pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff
|
||||
psrlw xmm6, 8
|
||||
pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00
|
||||
psllw xmm5, 8
|
||||
pcmpeqb xmm4, xmm4 // generate mask 0xff000000
|
||||
pslld xmm4, 24
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movd xmm3, [eax]
|
||||
lea eax, [eax + 4]
|
||||
movdqa xmm0, xmm3 // src argb
|
||||
pxor xmm3, xmm4 // ~alpha
|
||||
movd xmm2, [edx] // _r_b
|
||||
psrlw xmm3, 8 // alpha
|
||||
pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
pshuflw xmm3, xmm3,0F5h
|
||||
pand xmm2, xmm6 // _r_b
|
||||
paddw xmm3, xmm7 // 256 - alpha
|
||||
pmullw xmm2, xmm3 // _r_b * alpha
|
||||
movd xmm1, [edx] // _a_g
|
||||
psrlw xmm1, 8 // _a_g
|
||||
por xmm0, xmm4 // set alpha to 255
|
||||
pmullw xmm1, xmm3 // _a_g * alpha
|
||||
psrlw xmm2, 8 // _r_b convert to 8 bits again
|
||||
paddusb xmm0, xmm2 // + src argb
|
||||
pand xmm1, xmm5 // a_g_ convert to 8 bits again
|
||||
paddusb xmm0, xmm1 // + src argb
|
||||
sub ecx, 1
|
||||
movd [edx], xmm0
|
||||
lea edx, [edx + 4]
|
||||
jg convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBBlendRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
|
||||
// Do 1 to 3 pixels to get destination aligned.
|
||||
if ((uintptr_t)(dst_argb) & 15) {
|
||||
int count = width;
|
||||
if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
|
||||
count = (-(intptr_t)(dst_argb) >> 2) & 3;
|
||||
}
|
||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
|
||||
src_argb += count * 4;
|
||||
dst_argb += count * 4;
|
||||
width -= count;
|
||||
}
|
||||
// Do multiple of 4 pixels
|
||||
if (width & ~3) {
|
||||
ARGBBlendRow_Aligned_SSE2(src_argb, dst_argb, width & ~3);
|
||||
}
|
||||
// Do remaining 1 to 3 pixels
|
||||
if (width & 3) {
|
||||
src_argb += (width & ~3) * 4;
|
||||
dst_argb += (width & ~3) * 4;
|
||||
width &= 3;
|
||||
ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
|
||||
}
|
||||
}
|
||||
#endif // HAS_ARGBBLENDROW_SSE2
|
||||
|
||||
#ifdef HAS_ARGBBLENDROW_SSSE3
|
||||
// Blend 8 pixels at a time
|
||||
// Shuffle table for reversing the bytes.
|
||||
static const uvec8 kShuffleAlpha = {
|
||||
3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
|
||||
11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
|
||||
};
|
||||
|
||||
// Same as SSE2, but replaces
|
||||
// psrlw xmm3, 8 // alpha
|
||||
// pshufhw xmm3, xmm3,0F5h // 8 alpha words
|
||||
// pshuflw xmm3, xmm3,0F5h
|
||||
// with..
|
||||
// pshufb xmm3, kShuffleAlpha // alpha
|
||||
|
||||
// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                int width) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_argb
    mov        ecx, [esp + 12]  // width
    pcmpeqb    xmm7, xmm7       // generate constant 1
    psrlw      xmm7, 15
    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
    psrlw      xmm6, 8
    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
    psllw      xmm5, 8
    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
    pslld      xmm4, 24

    align      16
 convertloop:
    movdqu     xmm3, [eax]
    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    pshufb     xmm3, kShuffleAlpha // alpha
    movdqa     xmm2, [edx]      // _r_b
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [edx]      // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    movdqu     xmm3, [eax + 16]
    lea        eax, [eax + 32]
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx], xmm0
    jle        done

    movdqa     xmm0, xmm3       // src argb
    pxor       xmm3, xmm4       // ~alpha
    movdqa     xmm2, [edx + 16] // _r_b
    pshufb     xmm3, kShuffleAlpha // alpha
    pand       xmm2, xmm6       // _r_b
    paddw      xmm3, xmm7       // 256 - alpha
    pmullw     xmm2, xmm3       // _r_b * alpha
    movdqa     xmm1, [edx + 16] // _a_g
    psrlw      xmm1, 8          // _a_g
    por        xmm0, xmm4       // set alpha to 255
    pmullw     xmm1, xmm3       // _a_g * alpha
    psrlw      xmm2, 8          // _r_b convert to 8 bits again
    paddusb    xmm0, xmm2       // + src argb
    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
    paddusb    xmm0, xmm1       // + src argb
    sub        ecx, 4
    movdqa     [edx + 16], xmm0
    lea        edx, [edx + 32]
    jg         convertloop

 done:
    ret
  }
}

void ARGBBlendRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
    int count = width;
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlendRow1_SSE2(src_argb, dst_argb, count);
    src_argb += count * 4;
    dst_argb += count * 4;
    width -= count;
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlendRow_Aligned_SSSE3(src_argb, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
    src_argb += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlendRow1_SSE2(src_argb, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

///////////////////////////////////////
///////////////////// 2 source versions
///////////////////////////////////////

#ifdef HAS_ARGBBLENDROW_SSE2
// Blend 8 pixels at a time.
// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                               uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2418,7 +2152,7 @@ void ARGBBlend2Row_Aligned_SSE2(const uint8* src_argb0, const uint8* src_argb1,

// Blend 1 pixel at a time, unaligned.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2467,7 +2201,7 @@ void ARGBBlend2Row1_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  }
}

void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Any_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                           uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
@ -2475,7 +2209,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
    src_argb0 += count * 4;
    src_argb1 += count * 4;
    dst_argb += count * 4;
@ -2483,7 +2217,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlend2Row_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
    ARGBBlendRow_Aligned_SSE2(src_argb0, src_argb1, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
@ -2491,12 +2225,18 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,
    src_argb1 += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSE2
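// Scalar sketch of the two-source blend (not library code; the name and the
// premultiplied-src0 reading are assumptions mirroring the single-source math
// above): dst = src0 + src1 * (256 - alpha(src0)) >> 8, alpha set to 255.
static void ARGBBlend2Row1_Reference(const uint8* src_argb0,
                                     const uint8* src_argb1,
                                     uint8* dst_argb, int width) {
  for (int i = 0; i < width; ++i) {
    const int f = 256 - src_argb0[3];                   // 256 - alpha
    for (int c = 0; c < 3; ++c) {                       // B, G, R channels
      int v = src_argb0[c] + ((src_argb1[c] * f) >> 8);
      dst_argb[c] = v > 255 ? 255 : (uint8)v;           // saturating add
    }
    dst_argb[3] = 255;
    src_argb0 += 4;
    src_argb1 += 4;
    dst_argb += 4;
  }
}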

#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for duplicating each pixel's alpha into both of its words.
static const uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};

// Blend 8 pixels at a time.

@ -2509,7 +2249,7 @@ void ARGBBlend2Row_SSE2(const uint8* src_argb0, const uint8* src_argb1,

// Destination aligned to 16 bytes, multiple of 4 pixels.
__declspec(naked) __declspec(align(16))
void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                                uint8* dst_argb, int width) {
  __asm {
    push       esi
@ -2577,7 +2317,7 @@ void ARGBBlend2Row_Aligned_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  }
}

void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
void ARGBBlendRow_Any_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                            uint8* dst_argb, int width) {
  // Do 1 to 3 pixels to get destination aligned.
  if ((uintptr_t)(dst_argb) & 15) {
@ -2585,7 +2325,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    if (count > 4 && ((intptr_t)(dst_argb) & 3) == 0) {
      count = (-(intptr_t)(dst_argb) >> 2) & 3;
    }
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, count);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, count);
    src_argb0 += count * 4;
    src_argb1 += count * 4;
    dst_argb += count * 4;
@ -2593,7 +2333,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
  }
  // Do multiple of 4 pixels.
  if (width & ~3) {
    ARGBBlend2Row_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
    ARGBBlendRow_Aligned_SSSE3(src_argb0, src_argb1, dst_argb, width & ~3);
  }
  // Do remaining 1 to 3 pixels.
  if (width & 3) {
@ -2601,7 +2341,7 @@ void ARGBBlend2Row_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
    src_argb1 += (width & ~3) * 4;
    dst_argb += (width & ~3) * 4;
    width &= 3;
    ARGBBlend2Row1_SSE2(src_argb0, src_argb1, dst_argb, width);
    ARGBBlendRow1_SSE2(src_argb0, src_argb1, dst_argb, width);
  }
}
#endif  // HAS_ARGBBLENDROW_SSSE3

@ -55,7 +55,7 @@ void SetUseReferenceImpl(bool use) {
 *
 */

#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(__ARM_NEON__)
#define HAS_SCALEROWDOWN2_NEON
void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
                        uint8* dst, int dst_width) {
@ -566,12 +566,13 @@ static void ScaleFilterRows_NEON(uint8* dst_ptr,
 */

// Constants for SSE2 code
#elif defined(_M_IX86) || defined(__i386__) || defined(__x86_64__) && \
    !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && \
    (defined(_M_IX86) || defined(__i386__) || defined(__x86_64__))

#if defined(_MSC_VER)
#define TALIGN16(t, var) __declspec(align(16)) t _ ## var
#elif defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__) && \
    defined(__i386__)
#elif defined(__i386__) && \
    (defined(__APPLE__) || defined(__MINGW32__) || defined(__CYGWIN__))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#else
#define TALIGN16(t, var) t _ ## var __attribute__((aligned(16)))
@ -670,12 +671,12 @@ extern "C" TALIGN16(const uint16, scaleab2[8]) =
  { 65536 / 3, 65536 / 3, 65536 / 2, 65536 / 3, 65536 / 3, 65536 / 2, 0, 0 };
#endif
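// Usage sketch for TALIGN16 (kTable is hypothetical; scaleab2 above shows the
// real pattern). The macro yields a 16-byte aligned definition on every
// toolchain, and the _ ## var spelling appears intended to keep the
// assembly-visible name (_kTable) consistent across toolchains that do or do
// not prepend an underscore to C symbols:
//
//   extern "C" TALIGN16(const uint16, kTable[8]) =
//     { 1, 2, 3, 4, 5, 6, 7, 8 };  // movdqa-safe; referenced as _kTable in asm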

#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)

#define HAS_SCALEROWDOWN2_SSE2
// Reads 32 pixels, throws half away and writes 16 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -704,7 +705,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, int src_stride,
}
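// Scalar sketch of the point-sampling row above (not the tree's C fallback,
// just the shape of the operation): keep every other byte, drop the rest.
static void ScaleRowDown2_Reference(const uint8* src_ptr, int /* src_stride */,
                                    uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 2];
  }
}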
// Blends 32x2 rectangle to 16x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
                           uint8* dst_ptr, int dst_width) {
  __asm {
@ -749,7 +750,7 @@ void ScaleRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
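// Scalar sketch of the 32x2 -> 16x1 blend (a sketch; the +2 rounding term
// models the averaging the SIMD path performs):
static void ScaleRowDown2Int_Reference(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  const uint8* s = src_ptr;
  const uint8* t = src_ptr + src_stride;  // second source row
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);  // 2x2 average
    s += 2;
    t += 2;
  }
}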
#define HAS_SCALEROWDOWN4_SSE2
// Point samples 32 pixels to 8 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -780,7 +781,7 @@ static void ScaleRowDown4_SSE2(const uint8* src_ptr, int src_stride,

// Blends 32x4 rectangle to 8x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
@ -842,7 +843,7 @@ static void ScaleRowDown4Int_SSE2(const uint8* src_ptr, int src_stride,
#define HAS_SCALEROWDOWN8_SSE2
// Point samples 32 pixels to 4 pixels.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,
                               uint8* dst_ptr, int dst_width) {
  __asm {
@ -874,7 +875,7 @@ static void ScaleRowDown8_SSE2(const uint8* src_ptr, int src_stride,

// Blends 32x8 rectangle to 4x1.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 4 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  __asm {
@ -952,7 +953,7 @@ static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
@ -1001,7 +1002,7 @@ static void ScaleRowDown34_SSSE3(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1059,7 +1060,7 @@ static void ScaleRowDown34_1_Int_SSSE3(const uint8* src_ptr, int src_stride,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1122,7 +1123,7 @@ static void ScaleRowDown34_0_Int_SSSE3(const uint8* src_ptr, int src_stride,
// 3/8 point sampler

// Scale 32 pixels to 12
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
                                 uint8* dst_ptr, int dst_width) {
  __asm {
@ -1154,7 +1155,7 @@ static void ScaleRowDown38_SSSE3(const uint8* src_ptr, int src_stride,
}
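// Scalar sketch of 3/8 point sampling (a sketch, not the tree's fallback;
// the exact sample positions are an assumption): every 8 source pixels map
// to 3 destination pixels, so 32 in becomes 12 out.
static void ScaleRowDown38_Reference(const uint8* src_ptr,
                                     int /* src_stride */,
                                     uint8* dst_ptr, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = src_ptr[x * 8 / 3];  // step forward 8/3 source pixels on average
  }
}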

// Scale 16x3 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1221,7 +1222,7 @@ static void ScaleRowDown38_3_Int_SSSE3(const uint8* src_ptr, int src_stride,
}

// Scale 16x2 pixels to 6x1 with interpolation
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
                                       uint8* dst_ptr, int dst_width) {
  __asm {
@ -1269,7 +1270,7 @@ static void ScaleRowDown38_2_Int_SSSE3(const uint8* src_ptr, int src_stride,
#define HAS_SCALEADDROWS_SSE2

// Reads 16xN bytes and produces 16 shorts at a time.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
                              uint16* dst_ptr, int src_width,
                              int src_height) {
@ -1329,7 +1330,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
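// Scalar sketch of the vertical accumulation (not library code): sum
// src_height rows into 16-bit totals, one column at a time. 8-bit samples
// fit a uint16 accumulator for up to 257 rows without overflow.
static void ScaleAddRows_Reference(const uint8* src_ptr, int src_stride,
                                   uint16* dst_ptr, int src_width,
                                   int src_height) {
  for (int x = 0; x < src_width; ++x) {
    const uint8* s = src_ptr + x;
    uint16 sum = 0;
    for (int y = 0; y < src_height; ++y) {
      sum += *s;
      s += src_stride;
    }
    dst_ptr[x] = sum;
  }
}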

// Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
#define HAS_SCALEFILTERROWS_SSE2
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                 int src_stride, int dst_width,
                                 int source_y_fraction) {
@ -1420,7 +1421,7 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
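// Scalar sketch of the bilinear row filter (a sketch; reading
// source_y_fraction as a fixed-point weight in [0, 256), 0 meaning all of
// the first row, is an assumption):
static void ScaleFilterRows_Reference(uint8* dst_ptr, const uint8* src_ptr,
                                      int src_stride, int dst_width,
                                      int source_y_fraction) {
  const int y1 = source_y_fraction;        // weight of the second row
  const int y0 = 256 - y1;                 // weight of the first row
  const uint8* src1 = src_ptr + src_stride;
  for (int x = 0; x < dst_width; ++x) {
    dst_ptr[x] = (uint8)((src_ptr[x] * y0 + src1[x] * y1) >> 8);
  }
}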

// Bilinear row filtering combines 16x2 -> 16x1. SSSE3 version.
#define HAS_SCALEFILTERROWS_SSSE3
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                  int src_stride, int dst_width,
                                  int source_y_fraction) {
@ -1501,7 +1502,7 @@ static void ScaleFilterRows_SSSE3(uint8* dst_ptr, const uint8* src_ptr,

// Note that movdqa+palign may be better than movdqu.
// Alignment requirement: src_ptr 16 byte aligned, dst_ptr 8 byte aligned.
__declspec(naked)
__declspec(naked) __declspec(align(16))
static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    int dst_width) {
  __asm {
@ -1547,7 +1548,7 @@ static void ScaleFilterCols34_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
  }
}

#elif defined(__x86_64__) || defined(__i386__) && !defined(YUV_DISABLE_ASM)
#elif !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))

// GCC versions of row functions are verbatim conversions from Visual C.
// Generated using gcc disassembly on Visual C object file:
@ -1766,7 +1767,7 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
  );
}

#if defined(__i386__)
#if !defined(YUV_DISABLE_ASM) && defined(__i386__)
extern "C" void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                      uint8* dst_ptr, int dst_width);
asm(
@ -2260,7 +2261,7 @@ extern "C" void ScaleFilterRows_SSSE3(uint8* dst_ptr,
    "ret \n"
);

#elif defined(__x86_64__)
#elif !defined(YUV_DISABLE_ASM) && defined(__x86_64__)
static void ScaleRowDown8Int_SSE2(const uint8* src_ptr, int src_stride,
                                  uint8* dst_ptr, int dst_width) {
  asm volatile (