mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Math functions - add, subtract, multiply and shade adapted to NaCl-friendly addressing.
BUG=253
TEST=out\release\libyuv_unittest --gtest_filter=*Add*
R=dingkai@google.com, nfullagar@chromium.org

Review URL: https://webrtc-codereview.appspot.com/1972004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@746 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent 008ecea4fe
commit abfeea9b81
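What "NaCl friendly addressing" means in practice: the GCC inline assembly below stops hard-coding memory operands such as "movdqa (%0)" and "lea 0x10(%0),%0" and instead goes through the MEMACCESS()/MEMLEA() string-pasting macros, which can emit the r15-relative, %nacl:-prefixed operands required by the Native Client x86-64 sandbox. The following is only a sketch of how such macros can be laid out, not the actual row.h definitions:

/* Sketch only -- not the real libyuv macros. Under NaCl x86-64 every     */
/* load/store must be %r15-relative, so the access macro prepends the     */
/* %nacl: prefix; on other targets it degrades to a plain (reg) operand.  */
#if defined(__native_client__) && defined(__x86_64__)
#define MEMACCESS(base)      "%%nacl:(%%r15,%q" #base ")"
#define MEMLEA(offset, base) #offset "(%q" #base ")"
#else
#define MEMACCESS(base)      "(%" #base ")"
#define MEMLEA(offset, base) #offset "(%" #base ")"
#endif

/* Example use, mirroring the load/advance/store pattern in this diff. */
static void Copy16(const unsigned char* src, unsigned char* dst) {
  asm volatile (
    "movdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea       " MEMLEA(0x10, 0) ",%0         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "        \n"
    : "+r"(src), "+r"(dst)
    :
    : "memory", "xmm0");
}

On a regular x86 build the templates expand back to the familiar "(%0)" and "0x10(%0)" forms, so non-NaCl code generation should be unchanged.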
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 745
Version: 746
License: BSD
License File: LICENSE

@@ -38,8 +38,17 @@ extern "C" {
// The following are available on all x86 platforms, including NaCL:
#if !defined(LIBYUV_DISABLE_X86) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_ARGBBLENDROW_SSSE3
// Effects:
#define HAS_ARGBADDROW_SSE2
#define HAS_ARGBATTENUATEROW_SSSE3
#define HAS_ARGBBLENDROW_SSSE3
#define HAS_ARGBMULTIPLYROW_SSE2
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2

// Conversions:
#define HAS_FIXEDDIV_X86

#endif

// The following are available on all x86 platforms except NaCL x64:
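The practical effect of moving these Effects defines into the always-on x86 block is that NaCl builds now compile the SSE2 add/subtract/multiply/shade rows too. Downstream code normally just tests the HAS_* macros; a trivial, self-contained illustration of that compile-time gate (only the macro names come from row.h, the rest is illustrative):

#include <stdio.h>
/* The HAS_* names come from libyuv/row.h; include it in a real build. */
int main(void) {
#ifdef HAS_ARGBADDROW_SSE2
  puts("ARGBAddRow_SSE2 compiled in (now also for NaCl x86 builds)");
#else
  puts("SSE2 add row not compiled in; the C fallback row would be used");
#endif
#ifdef HAS_FIXEDDIV_X86
  puts("FixedDiv maps to FixedDiv_X86");
#else
  puts("FixedDiv maps to FixedDiv_C");
#endif
  return 0;
}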
@@ -47,7 +56,7 @@ extern "C" {
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!(defined(__native_client__) && defined(__x86_64__))

// Conversions.
// Conversions:
#define HAS_ABGRTOUVROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_ARGB1555TOARGBROW_SSE2

@@ -110,19 +119,14 @@ extern "C" {
#define HAS_YUY2TOUV422ROW_SSE2
#define HAS_YUY2TOUVROW_SSE2
#define HAS_YUY2TOYROW_SSE2
#define HAS_FIXEDDIV

// Effects
#define HAS_ARGBADDROW_SSE2
// Effects:
#define HAS_ARGBAFFINEROW_SSE2
#define HAS_ARGBCOLORMATRIXROW_SSSE3
#define HAS_ARGBGRAYROW_SSSE3
#define HAS_ARGBMIRRORROW_SSSE3
#define HAS_ARGBMULTIPLYROW_SSE2
#define HAS_ARGBQUANTIZEROW_SSE2
#define HAS_ARGBSEPIAROW_SSSE3
#define HAS_ARGBSHADEROW_SSE2
#define HAS_ARGBSUBTRACTROW_SSE2
#define HAS_ARGBUNATTENUATEROW_SSE2
#define HAS_COMPUTECUMULATIVESUMROW_SSE2
#define HAS_CUMULATIVESUMTOAVERAGEROW_SSE2

@@ -134,12 +138,12 @@ extern "C" {
#define HAS_SOBELYROW_SSSE3
#endif

// The following are Windows only.
// The following are Windows only:
// TODO(fbarchard): Port to gcc.
#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
#define HAS_ARGBCOLORTABLEROW_X86
#define HAS_RGBCOLORTABLEROW_X86
// Visual C 2012 required for AVX2.
// Caveat: Visual C 2012 required for AVX2.
#if _MSC_VER >= 1700
#define HAS_ARGBSHUFFLEROW_AVX2
#define HAS_ARGBTOUVROW_AVX2

@@ -157,7 +161,7 @@ extern "C" {
#define HAS_YUY2TOUVROW_AVX2
#define HAS_YUY2TOYROW_AVX2

// Effects
// Effects:
#define HAS_ARGBADDROW_AVX2
#define HAS_ARGBATTENUATEROW_AVX2
#define HAS_ARGBMIRRORROW_AVX2

@@ -167,7 +171,7 @@ extern "C" {
#endif
#endif

// The following are Yasm x86 only.
// The following are Yasm x86 only:
// TODO(fbarchard): Port AVX2 to inline.
#if !defined(LIBYUV_DISABLE_X86) && defined(HAVE_YASM)
(defined(_M_IX86) || defined(_M_X64) || \

@@ -194,7 +198,7 @@ extern "C" {
#endif
#endif

// The following are available on Neon platforms
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
#define HAS_ABGRTOUVROW_NEON

@@ -267,7 +271,7 @@ extern "C" {
#define HAS_YUY2TOUVROW_NEON
#define HAS_YUY2TOYROW_NEON

// Effects
// Effects:
#define HAS_ARGBADDROW_NEON
#define HAS_ARGBATTENUATEROW_NEON
#define HAS_ARGBBLENDROW_NEON

@@ -286,7 +290,7 @@ extern "C" {
#define HAS_INTERPOLATEROW_NEON
#endif

// The following are available on Mips platforms
// The following are available on Mips platforms:
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)

@@ -1534,8 +1538,9 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,

// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_C(int num, int div);
#ifdef HAS_FIXEDDIV
int FixedDiv(int num, int div);
int FixedDiv_X86(int num, int div);
#ifdef HAS_FIXEDDIV_X86
#define FixedDiv FixedDiv_X86
#else
#define FixedDiv FixedDiv_C
#endif
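FixedDiv returns num/div in 16.16 fixed point, and this hunk renames the x86 assembly version to FixedDiv_X86 so the bare FixedDiv name can be mapped per platform by the #ifdef that follows. A portable sketch of the fallback arithmetic (my own rendering of the stated contract, not a copy of FixedDiv_C):

#include <stdint.h>
#include <stdio.h>

/* 16.16 fixed point: result = (num << 16) / div, done in 64 bits so the  */
/* shift cannot overflow first. Matches the declared contract; the real   */
/* FixedDiv_C may differ in details (e.g. div == 0 handling).             */
static int FixedDiv_C_sketch(int num, int div) {
  return (int)(((int64_t)num << 16) / div);
}

int main(void) {
  /* Typical use in a scaler: step from source width to destination width. */
  int dx = FixedDiv_C_sketch(1280, 640);   /* 2.0  -> 0x20000 */
  int dy = FixedDiv_C_sketch(3, 4);        /* 0.75 -> 0x0C000 */
  printf("dx=0x%X dy=0x%X\n", dx, dy);
  return 0;
}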
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 745
#define LIBYUV_VERSION 746

#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

@@ -30,7 +30,9 @@ extern "C" {
uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);

// This module is for Visual C x86
#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
#if !defined(LIBYUV_DISABLE_X86) && \
!(defined(__native_client__) && defined(__x86_64__)) && \
(defined(_M_IX86) || \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
#define HAS_HASHDJB2_SSE41

@@ -73,8 +75,9 @@ uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
#define HAS_SUMSQUAREERROR_NEON
uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
#endif
#if !defined(LIBYUV_DISABLE_X86) && (defined(_M_IX86) || \
defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
!(defined(__native_client__) && defined(__x86_64__)) && \
(defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
#define HAS_SUMSQUAREERROR_SSE2
uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
#endif

@@ -16,7 +16,9 @@ namespace libyuv {
extern "C" {
#endif

#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
#if !defined(LIBYUV_DISABLE_X86) && \
!(defined(__native_client__) && defined(__x86_64__)) && \
(defined(__x86_64__) || defined(__i386__))

uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
uint32 sse;

@@ -65,6 +67,7 @@ uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
#endif // defined(__x86_64__) || defined(__i386__)

#if !defined(LIBYUV_DISABLE_X86) && \
!(defined(__native_client__) && defined(__x86_64__)) && \
(defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
#define HAS_HASHDJB2_SSE41
static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
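The compare hunks only tighten the preprocessor guards so the SSE paths stay off under NaCl x86-64; the routines themselves are simple. For reference, a scalar sketch of both operations (a djb2-style seeded hash, where the kHash16x33 = 33^16 constant presumably lets the SSE4.1 path fold 16 bytes per step, and a plain sum of squared differences); the shipped _C versions may be structured differently:

#include <stdint.h>
#include <stdio.h>

/* hash = hash * 33 + byte, starting from the caller's seed. */
static uint32_t HashDjb2_C_sketch(const uint8_t* src, int count, uint32_t seed) {
  uint32_t hash = seed;
  for (int i = 0; i < count; ++i) {
    hash = hash * 33u + src[i];
  }
  return hash;
}

/* Sum of squared byte differences, accumulated in 32 bits. */
static uint32_t SumSquareError_C_sketch(const uint8_t* a, const uint8_t* b,
                                        int count) {
  uint32_t sse = 0;
  for (int i = 0; i < count; ++i) {
    int diff = a[i] - b[i];
    sse += (uint32_t)(diff * diff);
  }
  return sse;
}

int main(void) {
  const uint8_t x[4] = {1, 2, 3, 4}, y[4] = {1, 2, 5, 0};
  printf("hash=%u sse=%u\n", HashDjb2_C_sketch(x, 4, 5381u),
         SumSquareError_C_sketch(x, y, 4));   /* sse = 0+0+4+16 = 20 */
  return 0;
}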
@@ -1904,7 +1904,7 @@ void I422ToUYVYRow_C(const uint8* src_y,
}
}

#if !defined(LIBYUV_DISABLE_X86)
#if !defined(LIBYUV_DISABLE_X86) && defined(HAS_I422TOARGBROW_SSSE3)
// row_win.cc has asm version, but GCC uses 2 step wrapper. 5% slower.
// TODO(fbarchard): Handle width > kMaxStride here instead of calling code.
#if defined(__x86_64__) || defined(__i386__)

@@ -2001,7 +2001,6 @@ void UYVYToARGBRow_Unaligned_SSSE3(const uint8* src_uyvy,
UYVYToYRow_Unaligned_SSE2(src_uyvy, row_y, width);
I422ToARGBRow_Unaligned_SSSE3(row_y, row_u, row_v, dst_argb, width);
}

#endif // defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)
#endif // !defined(LIBYUV_DISABLE_X86)
#undef clamp0

@@ -3027,6 +3027,7 @@ void CopyRow_X86(const uint8* src, uint8* dst, int width) {
}
#endif // HAS_COPYROW_X86

#ifdef HAS_COPYROW_ERMS
// Unaligned Multiple of 1.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
size_t width_tmp = static_cast<size_t>(width);

@@ -3039,6 +3040,7 @@ void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
: "memory", "cc"
);
}
#endif // HAS_COPYROW_ERMS

#ifdef HAS_SETROW_X86
void SetRow_X86(uint8* dst, uint32 v32, int width) {

@@ -4167,14 +4169,14 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
"movd %3,%%xmm2 \n"
"sub %0,%1 \n"
"punpcklbw %%xmm2,%%xmm2 \n"
"punpcklqdq %%xmm2,%%xmm2 \n"

// 4 pixel loop.
".p2align 2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x10,0)",%0 \n"
"movdqa %%xmm0,%%xmm1 \n"
"punpcklbw %%xmm0,%%xmm0 \n"
"punpckhbw %%xmm1,%%xmm1 \n"

@@ -4184,8 +4186,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
"psrlw $0x8,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%2 \n"
"movdqa %%xmm0,(%0,%1,1) \n"
"lea 0x10(%0),%0 \n"
"movdqa %%xmm0,"MEMACCESS(1)" \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"jg 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
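ARGBShadeRow_SSE2 scales every channel of each pixel by the matching channel of the packed ARGB value: both operands are byte-replicated to 16 bits, the (unchanged) pmulhuw keeps the high half, and psrlw drops 8 more bits, which amounts to roughly (p*257 * v*257) >> 24, i.e. about p*v/255 per channel. A scalar rendering of that arithmetic (illustrative only, not the shipped ARGBShadeRow_C):

#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of the SSE2 shade loop above: replicate each byte to */
/* 16 bits (x * 0x101), take the product's high bits, drop 8 more.        */
static void ARGBShadeRow_sketch(const uint8_t* src, uint8_t* dst, int width,
                                uint32_t value) {
  for (int i = 0; i < width; ++i) {
    for (int c = 0; c < 4; ++c) {
      uint32_t p = src[i * 4 + c] * 0x101u;
      uint32_t v = ((value >> (c * 8)) & 0xff) * 0x101u;
      dst[i * 4 + c] = (uint8_t)((p * v) >> 24);
    }
  }
}

int main(void) {
  const uint8_t px[4] = {0, 64, 128, 255};        /* B G R A */
  uint8_t out[4];
  ARGBShadeRow_sketch(px, out, 1, 0x80ffffffu);   /* halve alpha only */
  printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  /* 0 64 128 128 */
  return 0;
}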
@@ -4205,14 +4207,14 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"pxor %%xmm5,%%xmm5 \n"
"sub %0,%1 \n"
"sub %0,%2 \n"

// 4 pixel loop.
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%1),%%xmm2 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x10,0)",%0 \n"
"movdqu "MEMACCESS(1)",%%xmm2 \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"movdqu %%xmm0,%%xmm1 \n"
"movdqu %%xmm2,%%xmm3 \n"
"punpcklbw %%xmm0,%%xmm0 \n"

@@ -4223,8 +4225,8 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
"pmulhuw %%xmm3,%%xmm1 \n"
"packuswb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n"
"movdqu %%xmm0,"MEMACCESS(2)" \n"
"lea "MEMLEA(0x10,2)",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
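ARGBMultiplyRow_SSE2 modulates one ARGB row by another, channel by channel: one source is byte-replicated, the other zero-extended, and pmulhuw keeps the high 16 bits, so each output byte is (p*257 * q) >> 16, close to p*q/255 with a slight downward bias. A scalar sketch of the same arithmetic (not the shipped ARGBMultiplyRow_C):

#include <stdint.h>
#include <stdio.h>

/* Scalar equivalent of the SSE2 multiply loop above. */
static void ARGBMultiplyRow_sketch(const uint8_t* src0, const uint8_t* src1,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    uint32_t p = src0[i] * 0x101u;   /* replicated byte */
    uint32_t q = src1[i];            /* zero-extended   */
    dst[i] = (uint8_t)((p * q) >> 16);
  }
}

int main(void) {
  const uint8_t a[4] = {255, 128, 64, 255};
  const uint8_t b[4] = {255, 255, 128, 0};
  uint8_t d[4];
  ARGBMultiplyRow_sketch(a, b, d, 1);
  printf("%u %u %u %u\n", d[0], d[1], d[2], d[3]);  /* 254 127 32 0 */
  return 0;
}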
@@ -4244,18 +4246,17 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"

// 4 pixel loop.
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%1),%%xmm1 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x10,0)",%0 \n"
"movdqu "MEMACCESS(1)",%%xmm1 \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"paddusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n"
"movdqu %%xmm0,"MEMACCESS(2)" \n"
"lea "MEMLEA(0x10,2)",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1

@@ -4275,18 +4276,17 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
"sub %0,%1 \n"
"sub %0,%2 \n"

// 4 pixel loop.
".p2align 4 \n"
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu (%0,%1),%%xmm1 \n"
"movdqu "MEMACCESS(0)",%%xmm0 \n"
"lea "MEMLEA(0x10,0)",%0 \n"
"movdqu "MEMACCESS(1)",%%xmm1 \n"
"lea "MEMLEA(0x10,1)",%1 \n"
"psubusb %%xmm1,%%xmm0 \n"
"sub $0x4,%3 \n"
"movdqu %%xmm0,(%0,%2,1) \n"
"lea 0x10(%0),%0 \n"
"movdqu %%xmm0,"MEMACCESS(2)" \n"
"lea "MEMLEA(0x10,2)",%2 \n"
"jg 1b \n"
: "+r"(src_argb0), // %0
"+r"(src_argb1), // %1
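ARGBAddRow_SSE2 and ARGBSubtractRow_SSE2 are plain per-byte saturating operations (paddusb/psubusb); only their addressing changes in this patch. A scalar sketch for reference:

#include <stdint.h>
#include <stdio.h>

/* Per-byte saturating add / subtract, matching paddusb and psubusb. */
static void ARGBAddRow_sketch(const uint8_t* a, const uint8_t* b,
                              uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int s = a[i] + b[i];
    dst[i] = (uint8_t)(s > 255 ? 255 : s);
  }
}

static void ARGBSubtractRow_sketch(const uint8_t* a, const uint8_t* b,
                                   uint8_t* dst, int width) {
  for (int i = 0; i < width * 4; ++i) {
    int s = a[i] - b[i];
    dst[i] = (uint8_t)(s < 0 ? 0 : s);
  }
}

int main(void) {
  const uint8_t a[4] = {200, 100, 10, 255}, b[4] = {100, 100, 20, 1};
  uint8_t sum[4], diff[4];
  ARGBAddRow_sketch(a, b, sum, 1);
  ARGBSubtractRow_sketch(a, b, diff, 1);
  printf("add: %u %u %u %u  sub: %u %u %u %u\n",
         sum[0], sum[1], sum[2], sum[3], diff[0], diff[1], diff[2], diff[3]);
  /* add: 255 200 30 255  sub: 100 0 0 254 */
  return 0;
}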
@@ -4793,6 +4793,7 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
}
#endif // HAS_ARGBAFFINEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,

@@ -4895,6 +4896,7 @@ void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#endif
);
}
#endif // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1

@@ -5009,6 +5011,7 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2

#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
ptrdiff_t src_stride, int dst_width,

@@ -5111,6 +5114,7 @@ void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
#endif
);
}
#endif // HAS_INTERPOLATEROW_SSSE3

#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1

@@ -5225,6 +5229,7 @@ void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
}
#endif // HAS_INTERPOLATEROW_SSE2
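The InterpolateRow variants touched above blend two adjacent source rows into one destination row using an 8-bit fraction (the "Bilinear filter 16x2 -> 16x1" comments refer to 16 bytes from 2 rows per step). A scalar sketch of that blend, assuming the usual fraction-out-of-256 convention:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* dst = (row0 * (256 - f) + row1 * f) >> 8, f in [0, 256). Illustrative  */
/* rendering; the shipped InterpolateRow_C also special-cases f == 0.     */
static void InterpolateRow_sketch(uint8_t* dst, const uint8_t* src,
                                  ptrdiff_t src_stride, int width, int frac) {
  const uint8_t* src1 = src + src_stride;
  for (int x = 0; x < width; ++x) {
    dst[x] = (uint8_t)((src[x] * (256 - frac) + src1[x] * frac) >> 8);
  }
}

int main(void) {
  const uint8_t rows[8] = {0, 100, 200, 255, 255, 100, 0, 0};  /* 2 rows of 4 */
  uint8_t out[4];
  InterpolateRow_sketch(out, rows, 4, 4, 64);   /* 25% of the lower row */
  printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  /* 63 100 150 191 */
  return 0;
}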
#ifdef HAS_HALFROW_SSE2
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
uint8* dst_uv, int pix) {
asm volatile (

@@ -5247,7 +5252,9 @@ void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
#endif
);
}
#endif // HAS_HALFROW_SSE2

#ifdef HAS_ARGBTOBAYERROW_SSSE3
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
uint32 selector, int pix) {
asm volatile (

@@ -5275,7 +5282,9 @@ void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
#endif
);
}
#endif // HAS_ARGBTOBAYERROW_SSSE3

#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {

@@ -5330,7 +5339,9 @@ void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
#endif
);
}
#endif // HAS_ARGBSHUFFLEROW_SSSE3

#ifdef HAS_I422TOYUY2ROW_SSE2
void I422ToYUY2Row_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,

@@ -5365,7 +5376,9 @@ void I422ToYUY2Row_SSE2(const uint8* src_y,
#endif
);
}
#endif // HAS_I422TOYUY2ROW_SSE2

#ifdef HAS_I422TOUYVYROW_SSE2
void I422ToUYVYRow_SSE2(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,

@@ -5400,9 +5413,11 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
#endif
);
}
#endif // HAS_I422TOUYVYROW_SSE2
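I422ToYUY2Row_SSE2 and I422ToUYVYRow_SSE2 pack planar 4:2:2 Y/U/V into the two interleaved byte orders, YUY2 (Y0 U Y1 V) and UYVY (U Y0 V Y1); the visible change in these hunks is just the labelled #endif comments. A scalar sketch of the YUY2 packing for reference:

#include <stdint.h>
#include <stdio.h>

/* Pack planar 4:2:2 into YUY2 (Y0 U Y1 V per pixel pair). UYVY would     */
/* simply emit U Y0 V Y1 instead. Width is assumed even here.             */
static void I422ToYUY2Row_sketch(const uint8_t* src_y, const uint8_t* src_u,
                                 const uint8_t* src_v, uint8_t* dst, int width) {
  for (int x = 0; x < width; x += 2) {
    dst[0] = src_y[0];
    dst[1] = src_u[0];
    dst[2] = src_y[1];
    dst[3] = src_v[0];
    src_y += 2;
    src_u += 1;
    src_v += 1;
    dst += 4;
  }
}

int main(void) {
  const uint8_t y[4] = {10, 20, 30, 40}, u[2] = {100, 110}, v[2] = {200, 210};
  uint8_t yuy2[8];
  I422ToYUY2Row_sketch(y, u, v, yuy2, 4);
  for (int i = 0; i < 8; ++i) printf("%u ", yuy2[i]);
  printf("\n");   /* 10 100 20 200 30 110 40 210 */
  return 0;
}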
#ifdef HAS_FIXEDDIV_X86
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv(int num, int div) {
int FixedDiv_X86(int num, int div) {
asm volatile (
"cdq \n"
"shld $0x10,%%eax,%%edx \n"

@@ -5415,6 +5430,7 @@ int FixedDiv(int num, int div) {
);
return num;
}
#endif // HAS_FIXEDDIV_X86
#endif // defined(__x86_64__) || defined(__i386__)

#ifdef __cplusplus

@@ -5239,13 +5239,13 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // width
movd xmm2, [esp + 16] // value
sub edx, eax
punpcklbw xmm2, xmm2
punpcklqdq xmm2, xmm2

align 16
convertloop:
movdqa xmm0, [eax] // read 4 pixels
lea eax, [eax + 16]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2

@@ -5255,8 +5255,8 @@ void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
psrlw xmm1, 8
packuswb xmm0, xmm1
sub ecx, 4
movdqa [eax + edx], xmm0
lea eax, [eax + 16]
movdqa [edx], xmm0
lea edx, [edx + 16]
jg convertloop

ret

@@ -5276,25 +5276,25 @@ void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
pxor xmm5, xmm5 // constant 0
sub esi, eax
sub edx, eax

align 16
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm2, [eax + esi] // read 4 pixels from src_argb1
movdqu xmm2, [esi] // read 4 pixels from src_argb1
movdqu xmm1, xmm0
movdqu xmm3, xmm2
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
punpcklbw xmm0, xmm0 // first 2
punpckhbw xmm1, xmm1 // next 2
punpcklbw xmm2, xmm5 // first 2
punpckhbw xmm3, xmm5 // next 2
pmulhuw xmm0, xmm2 // src_argb0 * src_argb1 first 2
pmulhuw xmm1, xmm3 // src_argb0 * src_argb1 next 2
lea eax, [eax + 16]
lea esi, [esi + 16]
packuswb xmm0, xmm1
sub ecx, 4
movdqu [eax + edx], xmm0
lea eax, [eax + 16]
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop

pop esi

@@ -5315,8 +5315,6 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
sub edx, eax

sub ecx, 4
jl convertloop49

@@ -5324,11 +5322,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
align 16
convertloop4:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 4
movdqu [eax + edx], xmm0
lea eax, [eax + 16]
movdqu [edx], xmm0
lea edx, [edx + 16]
jge convertloop4

convertloop49:
@@ -5337,11 +5337,13 @@ void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,

convertloop1:
movd xmm0, [eax] // read 1 pixels from src_argb0
movd xmm1, [eax + esi] // read 1 pixels from src_argb1
lea eax, [eax + 4]
movd xmm1, [esi] // read 1 pixels from src_argb1
lea esi, [esi + 4]
paddusb xmm0, xmm1 // src_argb0 + src_argb1
sub ecx, 1
movd [eax + edx], xmm0
lea eax, [eax + 4]
movd [edx], xmm0
lea edx, [edx + 4]
jge convertloop1

convertloop19:

@@ -5362,17 +5364,17 @@ void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
sub edx, eax

align 16
convertloop:
movdqu xmm0, [eax] // read 4 pixels from src_argb0
movdqu xmm1, [eax + esi] // read 4 pixels from src_argb1
lea eax, [eax + 16]
movdqu xmm1, [esi] // read 4 pixels from src_argb1
lea esi, [esi + 16]
psubusb xmm0, xmm1 // src_argb0 - src_argb1
sub ecx, 4
movdqu [eax + edx], xmm0
lea eax, [eax + 16]
movdqu [edx], xmm0
lea edx, [edx + 16]
jg convertloop

pop esi

@@ -5392,14 +5394,14 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
vpxor ymm5, ymm5, ymm5 // constant 0
sub esi, eax
sub edx, eax
vpxor ymm5, ymm5, ymm5 // constant 0

align 16
convertloop:
vmovdqu ymm1, [eax] // read 8 pixels from src_argb0
vmovdqu ymm3, [eax + esi] // read 8 pixels from src_argb1
lea eax, [eax + 32]
vmovdqu ymm3, [esi] // read 8 pixels from src_argb1
lea esi, [esi + 32]
vpunpcklbw ymm0, ymm1, ymm1 // low 4
vpunpckhbw ymm1, ymm1, ymm1 // high 4
vpunpcklbw ymm2, ymm3, ymm5 // low 4

@@ -5407,8 +5409,8 @@ void ARGBMultiplyRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
vpmulhuw ymm0, ymm0, ymm2 // src_argb0 * src_argb1 low 4
vpmulhuw ymm1, ymm1, ymm3 // src_argb0 * src_argb1 high 4
vpackuswb ymm0, ymm0, ymm1
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop

@@ -5430,15 +5432,15 @@ void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
sub edx, eax

align 16
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
vpaddusb ymm0, ymm0, [eax + esi] // add 8 pixels from src_argb1
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
vpaddusb ymm0, ymm0, [esi] // add 8 pixels from src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop

@@ -5460,15 +5462,15 @@ void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
mov esi, [esp + 4 + 8] // src_argb1
mov edx, [esp + 4 + 12] // dst_argb
mov ecx, [esp + 4 + 16] // width
sub esi, eax
sub edx, eax

align 16
convertloop:
vmovdqu ymm0, [eax] // read 8 pixels from src_argb0
vpsubusb ymm0, ymm0, [eax + esi] // src_argb0 - src_argb1
vmovdqu [eax + edx], ymm0
lea eax, [eax + 32]
vpsubusb ymm0, ymm0, [esi] // src_argb0 - src_argb1
lea esi, [esi + 32]
vmovdqu [edx], ymm0
lea edx, [edx + 32]
sub ecx, 8
jg convertloop

@@ -6646,9 +6648,10 @@ void I422ToUYVYRow_SSE2(const uint8* src_y,
}
}

#ifdef HAS_FIXEDDIV_X86
// Divide num by div and return as 16.16 fixed point result.
__declspec(naked) __declspec(align(16))
int FixedDiv(int num, int div) {
int FixedDiv_X86(int num, int div) {
__asm {
mov eax, [esp + 4] // num
cdq // extend num to 64 bits

@@ -6658,6 +6661,7 @@ int FixedDiv(int num, int div) {
ret
}
}
#endif // HAS_FIXEDDIV_X86
#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)

#ifdef __cplusplus