mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
Reorder stores in FastConvertYUVToABGRRow_SSSE3 and FastConvertYUVToBGRARow_SSSE3. Add ReverseRow_SSE2. CPU detection now allows environment variable overrides: set LIBYUV_DISABLE_SSSE3=1 or set LIBYUV_DISABLE_SSE2=1. Reorder stores in rotate for Core 2.
BUG=none TEST=none Review URL: http://webrtc-codereview.appspot.com/317010 git-svn-id: http://libyuv.googlecode.com/svn/trunk@107 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
8b9759c4a7
commit
373cdbdc58
@ -1,6 +1,6 @@
|
|||||||
Name: libyuv
|
Name: libyuv
|
||||||
URL: http://code.google.com/p/libyuv/
|
URL: http://code.google.com/p/libyuv/
|
||||||
Version: 106
|
Version: 107
|
||||||
License: BSD
|
License: BSD
|
||||||
License File: LICENSE
|
License File: LICENSE
|
||||||
|
|
||||||
|
|||||||
@ -10,6 +10,7 @@
|
|||||||
|
|
||||||
#include "libyuv/cpu_id.h"
|
#include "libyuv/cpu_id.h"
|
||||||
|
|
||||||
|
#include <stdlib.h> // for getenv
|
||||||
#ifdef _MSC_VER
|
#ifdef _MSC_VER
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#endif
|
#endif
|
||||||
@ -55,6 +56,15 @@ int InitCpuFlags() {
|
|||||||
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
|
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
|
||||||
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
|
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
|
||||||
kCpuInitialized;
|
kCpuInitialized;
|
||||||
|
|
||||||
|
// environment variable overrides for testing.
|
||||||
|
if (getenv("LIBYUV_DISABLE_SSE2")) {
|
||||||
|
cpu_info_ &= ~kCpuHasSSE2;
|
||||||
|
}
|
||||||
|
// environment variable overrides for testing.
|
||||||
|
if (getenv("LIBYUV_DISABLE_SSSE3")) {
|
||||||
|
cpu_info_ &= ~kCpuHasSSSE3;
|
||||||
|
}
|
||||||
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
|
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
|
||||||
uint64_t features = android_getCpuFeatures();
|
uint64_t features = android_getCpuFeatures();
|
||||||
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
|
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
|
||||||
|
|||||||
@ -340,6 +340,18 @@ int I420Mirror(const uint8* src_y, int src_stride_y,
|
|||||||
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
|
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
|
||||||
ReverseRow = ReverseRow_SSSE3;
|
ReverseRow = ReverseRow_SSSE3;
|
||||||
} else
|
} else
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_REVERSE_ROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2) &&
|
||||||
|
IS_ALIGNED(width, 32) &&
|
||||||
|
IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
|
||||||
|
IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
|
||||||
|
IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
|
||||||
|
IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16) &&
|
||||||
|
IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
|
||||||
|
IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
|
||||||
|
ReverseRow = ReverseRow_SSE2;
|
||||||
|
} else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
ReverseRow = ReverseRow_C;
|
ReverseRow = ReverseRow_C;
|
||||||
|
|||||||
@ -867,6 +867,14 @@ void RotatePlane180(const uint8* src, int src_stride,
|
|||||||
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
|
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
|
||||||
ReverseRow = ReverseRow_SSSE3;
|
ReverseRow = ReverseRow_SSSE3;
|
||||||
} else
|
} else
|
||||||
|
#endif
|
||||||
|
#if defined(HAS_REVERSE_ROW_SSE2)
|
||||||
|
if (TestCpuFlag(kCpuHasSSE2) &&
|
||||||
|
IS_ALIGNED(width, 16) &&
|
||||||
|
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
|
||||||
|
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
|
||||||
|
ReverseRow = ReverseRow_SSE2;
|
||||||
|
} else
|
||||||
#endif
|
#endif
|
||||||
{
|
{
|
||||||
ReverseRow = ReverseRow_C;
|
ReverseRow = ReverseRow_C;
|
||||||
@ -1019,8 +1027,8 @@ __asm {
|
|||||||
lea eax, [eax - 16]
|
lea eax, [eax - 16]
|
||||||
pshufb xmm0, xmm5
|
pshufb xmm0, xmm5
|
||||||
movlpd qword ptr [edx], xmm0
|
movlpd qword ptr [edx], xmm0
|
||||||
lea edx, [edx + 8]
|
|
||||||
movhpd qword ptr [edi], xmm0
|
movhpd qword ptr [edi], xmm0
|
||||||
|
lea edx, [edx + 8]
|
||||||
lea edi, [edi + 8]
|
lea edi, [edi + 8]
|
||||||
sub ecx, 8
|
sub ecx, 8
|
||||||
ja convertloop
|
ja convertloop
|
||||||
@ -1044,8 +1052,8 @@ void ReverseRowUV_SSSE3(const uint8* src,
|
|||||||
"lea -16(%0),%0 \n"
|
"lea -16(%0),%0 \n"
|
||||||
"pshufb %%xmm5,%%xmm0 \n"
|
"pshufb %%xmm5,%%xmm0 \n"
|
||||||
"movlpd %%xmm0,(%1) \n"
|
"movlpd %%xmm0,(%1) \n"
|
||||||
"lea 8(%1),%1 \n"
|
|
||||||
"movhpd %%xmm0,(%2) \n"
|
"movhpd %%xmm0,(%2) \n"
|
||||||
|
"lea 8(%1),%1 \n"
|
||||||
"lea 8(%2),%2 \n"
|
"lea 8(%2),%2 \n"
|
||||||
"sub $8,%3 \n"
|
"sub $8,%3 \n"
|
||||||
"ja 1b \n"
|
"ja 1b \n"
|
||||||
|
|||||||
@ -65,6 +65,7 @@ void FastConvertYUVToABGRRow_NEON(const uint8* y_buf,
|
|||||||
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
|
#define HAS_FASTCONVERTYUVTOABGRROW_SSSE3
|
||||||
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
|
#define HAS_FASTCONVERTYUV444TOARGBROW_SSSE3
|
||||||
#define HAS_REVERSE_ROW_SSSE3
|
#define HAS_REVERSE_ROW_SSSE3
|
||||||
|
#define HAS_REVERSE_ROW_SSE2
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// The following are available on Neon platforms
|
// The following are available on Neon platforms
|
||||||
@ -102,6 +103,9 @@ void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
|||||||
#ifdef HAS_REVERSE_ROW_SSSE3
|
#ifdef HAS_REVERSE_ROW_SSSE3
|
||||||
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
|
void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width);
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef HAS_REVERSE_ROW_SSE2
|
||||||
|
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width);
|
||||||
|
#endif
|
||||||
#ifdef HAS_REVERSE_ROW_NEON
|
#ifdef HAS_REVERSE_ROW_NEON
|
||||||
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
|
void ReverseRow_NEON(const uint8* src, uint8* dst, int width);
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
@ -17,16 +17,22 @@ namespace libyuv {
|
|||||||
extern "C" {
|
extern "C" {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
// CONST is the storage qualifier applied to the SIMD constant tables in this
// file. It expands to `static const` on every platform except Apple builds,
// where it expands to nothing. Per the TODO this macro replaces, gcc on OSX
// hits an internal compiler fault when these tables are `static const`.
#ifdef __APPLE__
|
||||||
|
#define CONST
|
||||||
|
#else
|
||||||
|
#define CONST static const
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef HAS_ARGBTOUVROW_SSSE3
|
#ifdef HAS_ARGBTOUVROW_SSSE3
|
||||||
vec8 kARGBToU = {
|
CONST vec8 kARGBToU = {
|
||||||
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
|
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
|
||||||
};
|
};
|
||||||
|
|
||||||
uvec8 kARGBToV = {
|
CONST uvec8 kARGBToV = {
|
||||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
|
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0
|
||||||
};
|
};
|
||||||
|
|
||||||
uvec8 kAddUV128 = {
|
CONST uvec8 kAddUV128 = {
|
||||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
|
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
|
||||||
};
|
};
|
||||||
@ -35,31 +41,31 @@ uvec8 kAddUV128 = {
|
|||||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||||
|
|
||||||
// Constant multiplication table for converting ARGB to I400.
|
// Constant multiplication table for converting ARGB to I400.
|
||||||
vec8 kARGBToY = {
|
CONST vec8 kARGBToY = {
|
||||||
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
|
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
|
||||||
};
|
};
|
||||||
|
|
||||||
uvec8 kAddY16 = {
|
CONST uvec8 kAddY16 = {
|
||||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
|
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
|
||||||
};
|
};
|
||||||
|
|
||||||
// Shuffle table for converting BG24 to ARGB.
|
// Shuffle table for converting BG24 to ARGB.
|
||||||
uvec8 kShuffleMaskBG24ToARGB = {
|
CONST uvec8 kShuffleMaskBG24ToARGB = {
|
||||||
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
||||||
};
|
};
|
||||||
|
|
||||||
// Shuffle table for converting RAW to ARGB.
|
// Shuffle table for converting RAW to ARGB.
|
||||||
uvec8 kShuffleMaskRAWToARGB = {
|
CONST uvec8 kShuffleMaskRAWToARGB = {
|
||||||
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
||||||
};
|
};
|
||||||
|
|
||||||
// Shuffle table for converting ABGR to ARGB.
|
// Shuffle table for converting ABGR to ARGB.
|
||||||
uvec8 kShuffleMaskABGRToARGB = {
|
CONST uvec8 kShuffleMaskABGRToARGB = {
|
||||||
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
|
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
|
||||||
};
|
};
|
||||||
|
|
||||||
// Shuffle table for converting BGRA to ARGB.
|
// Shuffle table for converting BGRA to ARGB.
|
||||||
uvec8 kShuffleMaskBGRAToARGB = {
|
CONST uvec8 kShuffleMaskBGRAToARGB = {
|
||||||
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
|
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -352,7 +358,7 @@ struct {
|
|||||||
vec16 kUVBiasR;
|
vec16 kUVBiasR;
|
||||||
vec16 kYSub16;
|
vec16 kYSub16;
|
||||||
vec16 kYToRgb;
|
vec16 kYToRgb;
|
||||||
} SIMD_ALIGNED(kYuvConstants) = {
|
} CONST SIMD_ALIGNED(kYuvConstants) = {
|
||||||
{ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
|
{ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
|
||||||
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
|
{ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
|
||||||
{ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
|
{ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
|
||||||
@ -445,8 +451,8 @@ void OMITFP FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf, // rdi
|
|||||||
"punpcklbw %%xmm2,%%xmm5 \n"
|
"punpcklbw %%xmm2,%%xmm5 \n"
|
||||||
"movdqa %%xmm5,%%xmm0 \n"
|
"movdqa %%xmm5,%%xmm0 \n"
|
||||||
"punpcklwd %%xmm1,%%xmm5 \n"
|
"punpcklwd %%xmm1,%%xmm5 \n"
|
||||||
"movdqa %%xmm5,(%3) \n"
|
|
||||||
"punpckhwd %%xmm1,%%xmm0 \n"
|
"punpckhwd %%xmm1,%%xmm0 \n"
|
||||||
|
"movdqa %%xmm5,(%3) \n"
|
||||||
"movdqa %%xmm0,0x10(%3) \n"
|
"movdqa %%xmm0,0x10(%3) \n"
|
||||||
"lea 0x20(%3),%3 \n"
|
"lea 0x20(%3),%3 \n"
|
||||||
"sub $0x8,%4 \n"
|
"sub $0x8,%4 \n"
|
||||||
@ -480,8 +486,8 @@ void OMITFP FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf, // rdi
|
|||||||
"punpcklbw %%xmm5,%%xmm0 \n"
|
"punpcklbw %%xmm5,%%xmm0 \n"
|
||||||
"movdqa %%xmm2,%%xmm1 \n"
|
"movdqa %%xmm2,%%xmm1 \n"
|
||||||
"punpcklwd %%xmm0,%%xmm2 \n"
|
"punpcklwd %%xmm0,%%xmm2 \n"
|
||||||
"movdqa %%xmm2,(%3) \n"
|
|
||||||
"punpckhwd %%xmm0,%%xmm1 \n"
|
"punpckhwd %%xmm0,%%xmm1 \n"
|
||||||
|
"movdqa %%xmm2,(%3) \n"
|
||||||
"movdqa %%xmm1,0x10(%3) \n"
|
"movdqa %%xmm1,0x10(%3) \n"
|
||||||
"lea 0x20(%3),%3 \n"
|
"lea 0x20(%3),%3 \n"
|
||||||
"sub $0x8,%4 \n"
|
"sub $0x8,%4 \n"
|
||||||
@ -640,11 +646,8 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
|||||||
|
|
||||||
#ifdef HAS_REVERSE_ROW_SSSE3
|
#ifdef HAS_REVERSE_ROW_SSSE3
|
||||||
|
|
||||||
// TODO(fbarchard): define CONST macro that is static const for linux, but
|
|
||||||
// does nothing for gcc on OSX (which has an internal compiler fault)
|
|
||||||
|
|
||||||
// Shuffle table for reversing the bytes.
|
// Shuffle table for reversing the bytes.
|
||||||
uvec8 kShuffleReverse = {
|
CONST uvec8 kShuffleReverse = {
|
||||||
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -653,14 +656,14 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
|||||||
asm volatile (
|
asm volatile (
|
||||||
"movdqa %3,%%xmm5 \n"
|
"movdqa %3,%%xmm5 \n"
|
||||||
"lea -0x10(%0,%2,1),%0 \n"
|
"lea -0x10(%0,%2,1),%0 \n"
|
||||||
"1: \n"
|
"1: \n"
|
||||||
"movdqa (%0),%%xmm0 \n"
|
"movdqa (%0),%%xmm0 \n"
|
||||||
"lea -0x10(%0),%0 \n"
|
"lea -0x10(%0),%0 \n"
|
||||||
"pshufb %%xmm5,%%xmm0 \n"
|
"pshufb %%xmm5,%%xmm0 \n"
|
||||||
"movdqa %%xmm0,(%1) \n"
|
"movdqa %%xmm0,(%1) \n"
|
||||||
"lea 0x10(%1),%1 \n"
|
"lea 0x10(%1),%1 \n"
|
||||||
"sub $0x10,%2 \n"
|
"sub $0x10,%2 \n"
|
||||||
"ja 1b \n"
|
"ja 1b \n"
|
||||||
: "+r"(src), // %0
|
: "+r"(src), // %0
|
||||||
"+r"(dst), // %1
|
"+r"(dst), // %1
|
||||||
"+r"(temp_width) // %2
|
"+r"(temp_width) // %2
|
||||||
@ -673,6 +676,38 @@ void ReverseRow_SSSE3(const uint8* src, uint8* dst, int width) {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_REVERSE_ROW_SSE2
|
||||||
|
|
||||||
|
// Copies `width` bytes from `src` to `dst` in reverse byte order using only
// SSE2 instructions (no pshufb), so it can serve CPUs without SSSE3.
// Each pass loads 16 bytes from the tail of `src`, reverses them in xmm0 and
// stores them at the head of `dst`.
// movdqa is an aligned access, so `src + width - 16` and `dst` must be
// 16-byte aligned. The loop body runs before the count is tested, so at least
// 16 bytes are always processed; the dispatch code added with this function
// only selects it when width is a multiple of 16.
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||||
|
// Widen to pointer size so the count can be used as an index register in lea.
intptr_t temp_width = static_cast<intptr_t>(width);
|
||||||
|
asm volatile (
|
||||||
|
// Point %0 at the last 16-byte group of the source row.
"lea -0x10(%0,%2,1),%0 \n"
|
||||||
|
"1: \n"
|
||||||
|
"movdqa (%0),%%xmm0 \n"
|
||||||
|
// Step the source pointer backwards by 16 bytes.
"lea -0x10(%0),%0 \n"
|
||||||
|
// Swap the two bytes inside every 16-bit word: (w << 8) | (w >> 8).
"movdqa %%xmm0,%%xmm1 \n"
|
||||||
|
"psllw $0x8,%%xmm0 \n"
|
||||||
|
"psrlw $0x8,%%xmm1 \n"
|
||||||
|
"por %%xmm1,%%xmm0 \n"
|
||||||
|
// 0x1b selects words 3,2,1,0: reverse the four words of the low half...
"pshuflw $0x1b,%%xmm0,%%xmm0 \n"
|
||||||
|
// ...and of the high half.
"pshufhw $0x1b,%%xmm0,%%xmm0 \n"
|
||||||
|
// 0x4e selects dwords 2,3,0,1: swap the two 64-bit halves.
"pshufd $0x4e,%%xmm0,%%xmm0 \n"
|
||||||
|
"movdqa %%xmm0,(%1) \n"
|
||||||
|
// Step the destination pointer forwards by 16 bytes.
"lea 0x10(%1),%1 \n"
|
||||||
|
// Loop while bytes remain (unsigned compare of the count after subtracting).
"sub $0x10,%2 \n"
|
||||||
|
"ja 1b \n"
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(temp_width) // %2
|
||||||
|
:
|
||||||
|
: "memory", "cc"
|
||||||
|
// The xmm clobbers are only listed when the compiler is targeting SSE2.
#if defined(__SSE2__)
|
||||||
|
, "xmm0", "xmm1"
|
||||||
|
#endif
|
||||||
|
);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
@ -654,8 +654,8 @@ void FastConvertYUVToBGRARow_SSSE3(const uint8* y_buf,
|
|||||||
punpcklbw xmm5, xmm2 // AR
|
punpcklbw xmm5, xmm2 // AR
|
||||||
movdqa xmm0, xmm5
|
movdqa xmm0, xmm5
|
||||||
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
|
punpcklwd xmm5, xmm1 // BGRA first 4 pixels
|
||||||
movdqa [edx], xmm5
|
|
||||||
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
|
punpckhwd xmm0, xmm1 // BGRA next 4 pixels
|
||||||
|
movdqa [edx], xmm5
|
||||||
movdqa [edx + 16], xmm0
|
movdqa [edx + 16], xmm0
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
|
|
||||||
@ -694,8 +694,8 @@ void FastConvertYUVToABGRRow_SSSE3(const uint8* y_buf,
|
|||||||
punpcklbw xmm0, xmm5 // BA
|
punpcklbw xmm0, xmm5 // BA
|
||||||
movdqa xmm1, xmm2
|
movdqa xmm1, xmm2
|
||||||
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
punpcklwd xmm2, xmm0 // RGBA first 4 pixels
|
||||||
movdqa [edx], xmm2
|
|
||||||
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
punpckhwd xmm1, xmm0 // RGBA next 4 pixels
|
||||||
|
movdqa [edx], xmm2
|
||||||
movdqa [edx + 16], xmm1
|
movdqa [edx + 16], xmm1
|
||||||
lea edx, [edx + 32]
|
lea edx, [edx + 32]
|
||||||
|
|
||||||
@ -794,7 +794,7 @@ void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
|
|||||||
|
|
||||||
convertloop:
|
convertloop:
|
||||||
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
// Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
|
||||||
movq xmm0, [eax]
|
movq xmm0, qword ptr [eax]
|
||||||
lea eax, [eax + 8]
|
lea eax, [eax + 8]
|
||||||
punpcklbw xmm0, xmm0 // Y.Y
|
punpcklbw xmm0, xmm0 // Y.Y
|
||||||
psubusw xmm0, xmm3
|
psubusw xmm0, xmm3
|
||||||
@ -849,6 +849,33 @@ __asm {
|
|||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef HAS_REVERSE_ROW_SSE2
|
||||||
|
|
||||||
|
// Copies `width` bytes from `src` to `dst` in reverse byte order using only
// SSE2 instructions (no pshufb), so it can serve CPUs without SSSE3.
// Declared naked: there is no compiler-generated prologue or epilogue, so the
// arguments are read directly off the stack and the asm issues its own `ret`.
// movdqa is an aligned access, so `src + width - 16` and `dst` must be
// 16-byte aligned. The loop body runs before the count is tested, so at least
// 16 bytes are always processed; the dispatch code added with this function
// only selects it when width is a multiple of 16.
__declspec(naked)
|
||||||
|
void ReverseRow_SSE2(const uint8* src, uint8* dst, int width) {
|
||||||
|
__asm {
|
||||||
|
mov eax, [esp + 4] // src
|
||||||
|
mov edx, [esp + 8] // dst
|
||||||
|
mov ecx, [esp + 12] // width
|
||||||
|
// Point eax at the last 16-byte group of the source row.
lea eax, [eax + ecx - 16]
|
||||||
|
convertloop:
|
||||||
|
movdqa xmm0, [eax]
|
||||||
|
// Step the source pointer backwards by 16 bytes.
lea eax, [eax - 16]
|
||||||
|
movdqa xmm1, xmm0 // swap the two bytes inside every 16-bit word
|
||||||
|
psllw xmm0, 8
|
||||||
|
psrlw xmm1, 8
|
||||||
|
por xmm0, xmm1
|
||||||
|
pshuflw xmm0, xmm0, 0x1b // reverse the four words of the low half (3,2,1,0)
|
||||||
|
// Reverse the four words of the high half.
pshufhw xmm0, xmm0, 0x1b
|
||||||
|
// 0x4e selects dwords 2,3,0,1: swap the two 64-bit halves.
pshufd xmm0, xmm0, 0x4e
|
||||||
|
movdqa [edx], xmm0
|
||||||
|
// Step the destination pointer forwards by 16 bytes.
lea edx, [edx + 16]
|
||||||
|
// Loop while bytes remain (unsigned compare of the count after subtracting).
sub ecx, 16
|
||||||
|
ja convertloop
|
||||||
|
ret
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
} // extern "C"
|
} // extern "C"
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user