mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
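Remove the 16-byte alignment requirement from ARGBToRGB24 and ARGBToRAW so that the SSSE3 fast path can be used all of the time: the callers no longer check IS_ALIGNED on the source/destination pointers and strides, and the row functions use unaligned movdqu loads and stores instead of movdqa. This improves performance on Westmere and later CPUs, but hurts performance for aligned buffers on older CPUs.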
BUG=230
TESTED=try bot
R=nfullagar@google.com
Review URL: https://webrtc-codereview.appspot.com/2197007
git-svn-id: http://libyuv.googlecode.com/svn/trunk@785 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
1390aaf69a
commit
7e7c7753ba
@ -1,6 +1,6 @@
|
||||
Name: libyuv
|
||||
URL: http://code.google.com/p/libyuv/
|
||||
Version: 784
|
||||
Version: 785
|
||||
License: BSD
|
||||
License File: LICENSE
|
||||
|
||||
|
||||
@ -11,6 +11,6 @@
|
||||
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
|
||||
#define INCLUDE_LIBYUV_VERSION_H_
|
||||
|
||||
#define LIBYUV_VERSION 784
|
||||
#define LIBYUV_VERSION 785
|
||||
|
||||
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
|
||||
|
||||
@ -744,9 +744,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
|
||||
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
|
||||
ARGBToRGB24Row_C;
|
||||
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
|
||||
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
|
||||
IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
||||
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
|
||||
@ -792,9 +790,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
|
||||
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
|
||||
ARGBToRAWRow_C;
|
||||
#if defined(HAS_ARGBTORAWROW_SSSE3)
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
|
||||
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
|
||||
IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
|
||||
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
|
||||
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
ARGBToRAWRow = ARGBToRAWRow_SSSE3;
|
||||
|
||||
@ -569,10 +569,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
|
||||
"movdqa %3,%%xmm6 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm2 \n"
|
||||
"movdqa 0x30(%0),%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x20(%0),%%xmm2 \n"
|
||||
"movdqu 0x30(%0),%%xmm3 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"pshufb %%xmm6,%%xmm0 \n"
|
||||
"pshufb %%xmm6,%%xmm1 \n"
|
||||
@ -584,13 +584,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"por %%xmm4,%%xmm0 \n"
|
||||
"pslldq $0x8,%%xmm5 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"movdqu %%xmm0,(%1) \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"psrldq $0x8,%%xmm2 \n"
|
||||
"pslldq $0x4,%%xmm3 \n"
|
||||
"por %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"movdqa %%xmm2,0x20(%1) \n"
|
||||
"movdqu %%xmm1,0x10(%1) \n"
|
||||
"movdqu %%xmm2,0x20(%1) \n"
|
||||
"lea 0x30(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
@ -610,10 +610,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
|
||||
"movdqa %3,%%xmm6 \n"
|
||||
".p2align 4 \n"
|
||||
"1: \n"
|
||||
"movdqa (%0),%%xmm0 \n"
|
||||
"movdqa 0x10(%0),%%xmm1 \n"
|
||||
"movdqa 0x20(%0),%%xmm2 \n"
|
||||
"movdqa 0x30(%0),%%xmm3 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqu 0x10(%0),%%xmm1 \n"
|
||||
"movdqu 0x20(%0),%%xmm2 \n"
|
||||
"movdqu 0x30(%0),%%xmm3 \n"
|
||||
"lea 0x40(%0),%0 \n"
|
||||
"pshufb %%xmm6,%%xmm0 \n"
|
||||
"pshufb %%xmm6,%%xmm1 \n"
|
||||
@ -625,13 +625,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
|
||||
"movdqa %%xmm2,%%xmm5 \n"
|
||||
"por %%xmm4,%%xmm0 \n"
|
||||
"pslldq $0x8,%%xmm5 \n"
|
||||
"movdqa %%xmm0,(%1) \n"
|
||||
"movdqu %%xmm0,(%1) \n"
|
||||
"por %%xmm5,%%xmm1 \n"
|
||||
"psrldq $0x8,%%xmm2 \n"
|
||||
"pslldq $0x4,%%xmm3 \n"
|
||||
"por %%xmm3,%%xmm2 \n"
|
||||
"movdqa %%xmm1,0x10(%1) \n"
|
||||
"movdqa %%xmm2,0x20(%1) \n"
|
||||
"movdqu %%xmm1,0x10(%1) \n"
|
||||
"movdqu %%xmm2,0x20(%1) \n"
|
||||
"lea 0x30(%1),%1 \n"
|
||||
"sub $0x10,%2 \n"
|
||||
"jg 1b \n"
|
||||
|
||||
@ -479,10 +479,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // fetch 16 pixels of argb
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
movdqu xmm0, [eax] // fetch 16 pixels of argb
|
||||
movdqu xmm1, [eax + 16]
|
||||
movdqu xmm2, [eax + 32]
|
||||
movdqu xmm3, [eax + 48]
|
||||
lea eax, [eax + 64]
|
||||
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
|
||||
pshufb xmm1, xmm6
|
||||
@ -494,13 +494,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
||||
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
|
||||
por xmm0, xmm4 // 4 bytes from 1 for 0
|
||||
pslldq xmm5, 8 // 8 bytes from 2 for 1
|
||||
movdqa [edx], xmm0 // store 0
|
||||
movdqu [edx], xmm0 // store 0
|
||||
por xmm1, xmm5 // 8 bytes from 2 for 1
|
||||
psrldq xmm2, 8 // 4 bytes from 2
|
||||
pslldq xmm3, 4 // 12 bytes from 3 for 2
|
||||
por xmm2, xmm3 // 12 bytes from 3 for 2
|
||||
movdqa [edx + 16], xmm1 // store 1
|
||||
movdqa [edx + 32], xmm2 // store 2
|
||||
movdqu [edx + 16], xmm1 // store 1
|
||||
movdqu [edx + 32], xmm2 // store 2
|
||||
lea edx, [edx + 48]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
@ -518,10 +518,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
||||
|
||||
align 16
|
||||
convertloop:
|
||||
movdqa xmm0, [eax] // fetch 16 pixels of argb
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
movdqu xmm0, [eax] // fetch 16 pixels of argb
|
||||
movdqu xmm1, [eax + 16]
|
||||
movdqu xmm2, [eax + 32]
|
||||
movdqu xmm3, [eax + 48]
|
||||
lea eax, [eax + 64]
|
||||
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
|
||||
pshufb xmm1, xmm6
|
||||
@ -533,13 +533,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
|
||||
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
|
||||
por xmm0, xmm4 // 4 bytes from 1 for 0
|
||||
pslldq xmm5, 8 // 8 bytes from 2 for 1
|
||||
movdqa [edx], xmm0 // store 0
|
||||
movdqu [edx], xmm0 // store 0
|
||||
por xmm1, xmm5 // 8 bytes from 2 for 1
|
||||
psrldq xmm2, 8 // 4 bytes from 2
|
||||
pslldq xmm3, 4 // 12 bytes from 3 for 2
|
||||
por xmm2, xmm3 // 12 bytes from 3 for 2
|
||||
movdqa [edx + 16], xmm1 // store 1
|
||||
movdqa [edx + 32], xmm2 // store 2
|
||||
movdqu [edx + 16], xmm1 // store 1
|
||||
movdqu [edx + 32], xmm2 // store 2
|
||||
lea edx, [edx + 48]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user