x64 versions of assembly that also work with 32-bit PIC. Biplanar functions take 2 strides to handle odd width. Inversion for src height.

TEST=media and planar unittests in Talk

BUG=none
Review URL: http://webrtc-codereview.appspot.com/244004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@33 16f28f9a-4ce2-e073-06de-1de4eb20be90
Author: fbarchard@google.com
Date:   2011-10-20 06:04:16 +0000
Parent: 3de12ae1c6
Commit: 3faa0f15cb

3 changed files with 1252 additions and 598 deletions
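The headline changes are the negative-height inversion convention and the (height + 1) >> 1 chroma rounding visible in the hunks below. A minimal caller-side sketch of both, assuming packed strides and using the NV12ToI420 signature added in this commit (frame dimensions and buffer names are illustrative):

    #include "libyuv/planar_functions.h"  // NV12ToI420, uint8 typedef

    // Convert NV12 to I420 while flipping vertically. Odd dimensions
    // exercise the halfheight = (height + 1) >> 1 rounding below.
    void FlipNV12ToI420(const uint8* src_y, const uint8* src_uv,
                        uint8* dst_y, uint8* dst_u, uint8* dst_v) {
      const int width = 175;                   // odd width
      const int height = 99;                   // odd height: 50 chroma rows
      const int half_width = (width + 1) / 2;  // chroma width rounds up
      libyuv::NV12ToI420(src_y, width,           // assumed packed Y stride
                         src_uv, half_width * 2, // interleaved UV stride
                         dst_y, width,
                         dst_u, half_width,
                         dst_v, half_width,
                         width, -height);      // negative height inverts
    }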


@@ -8,7 +8,6 @@
  * be found in the AUTHORS file in the root of the source tree.
  */
 #include "libyuv/planar_functions.h"
 #include <string.h>
@@ -32,9 +31,12 @@ static void SplitUV_NEON(const uint8* src_uv,
     "vst1.u8 {q1}, [%2]! \n"  // Store V
     "subs %3, %3, #16 \n"  // 16 processed per loop
     "bhi 1b \n"
-    :                                                // Output registers
-    : "r"(src_uv), "r"(dst_u), "r"(dst_v), "r"(pix)  // Input registers
-    : "q0", "q1"                                     // Clobber List
+    : "+r"(src_uv),
+      "+r"(dst_u),
+      "+r"(dst_v),
+      "+r"(pix)      // Output registers
+    :                // Input registers
+    : "q0", "q1"     // Clobber List
   );
 }
@@ -104,7 +106,7 @@ static void SplitUV_SSE2(const uint8* src_uv,
 #define HAS_SPLITUV_SSE2
 static void SplitUV_SSE2(const uint8* src_uv,
                          uint8* dst_u, uint8* dst_v, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "psrlw $0x8,%%xmm7\n"
     "1:"
@@ -125,11 +127,11 @@ static void SplitUV_SSE2(const uint8* src_uv,
     "lea 0x10(%2),%2\n"
     "sub $0x10,%3\n"
     "ja 1b\n"
-    :
-    : "r"(src_uv),   // %0
-      "r"(dst_u),    // %1
-      "r"(dst_v),    // %2
-      "r"(pix)       // %3
+    : "+r"(src_uv),  // %0
+      "+r"(dst_u),   // %1
+      "+r"(dst_v),   // %2
+      "+r"(pix)      // %3
+    :
     : "memory"
   );
 }
@@ -191,9 +193,10 @@ int I420Copy(const uint8* src_y, int src_stride_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    int halfheight = (height + 1) >> 1;
     src_y = src_y + (height - 1) * src_stride_y;
-    src_u = src_u + (height - 1) * src_stride_u;
-    src_v = src_v + (height - 1) * src_stride_v;
+    src_u = src_u + (halfheight - 1) * src_stride_u;
+    src_v = src_v + (halfheight - 1) * src_stride_v;
     src_stride_y = -src_stride_y;
     src_stride_u = -src_stride_u;
     src_stride_v = -src_stride_v;
@@ -267,9 +270,8 @@ int I422ToI420(const uint8* src_y, int src_stride_y,
 // M420 is row biplanar 420: 2 rows of Y and 1 row of VU.
 // Chroma is half width / half height. (420)
 // src_stride_m420 is row planar. Normally this will be the width in pixels.
-// The UV plane is half width, but 2 values, so src_stride_m420 applies to this
-// as well as the two Y planes.
-// TODO(fbarchard): Do NV21/NV12 formats with this function
+// The UV plane is half width, but 2 values, so src_stride_m420 applies to
+// this as well as the two Y planes.
 static int X420ToI420(const uint8* src_y,
                       int src_stride_y0, int src_stride_y1,
                       const uint8* src_uv, int src_stride_uv,
@@ -280,9 +282,10 @@ static int X420ToI420(const uint8* src_y,
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
+    int halfheight = (height + 1) >> 1;
     dst_y = dst_y + (height - 1) * dst_stride_y;
-    dst_u = dst_u + (height - 1) * dst_stride_u;
-    dst_v = dst_v + (height - 1) * dst_stride_v;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
     dst_stride_y = -dst_stride_y;
     dst_stride_u = -dst_stride_u;
     dst_stride_v = -dst_stride_v;
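The row-biplanar M420 layout described in the comment above is why X420ToI420 takes two Y strides. A hypothetical address-mapping sketch, inferred from that layout comment rather than from code in this diff (uint8 as in libyuv/basic_types.h):

    // Each group of 3 buffer rows (stride s) holds 2 Y rows then 1 UV row,
    // so consecutive Y rows alternate between being s apart (within a group)
    // and 2 * s apart (skipping the UV row) -- hence the two Y strides.
    static const uint8* M420YRow(const uint8* m420, int s, int y_row) {
      return m420 + (y_row / 2) * 3 * s + (y_row & 1) * s;
    }
    static const uint8* M420UVRow(const uint8* m420, int s, int uv_row) {
      return m420 + uv_row * 3 * s + 2 * s;
    }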
@@ -340,6 +343,21 @@ int M420ToI420(const uint8* src_m420, int src_stride_m420,
 }
 
 // Convert NV12 to I420.
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+               const uint8* src_uv, int src_stride_uv,
+               uint8* dst_y, int dst_stride_y,
+               uint8* dst_u, int dst_stride_u,
+               uint8* dst_v, int dst_stride_v,
+               int width, int height) {
+  return X420ToI420(src_y, src_stride_y, src_stride_y,
+                    src_uv, src_stride_uv,
+                    dst_y, dst_stride_y,
+                    dst_u, dst_stride_u,
+                    dst_v, dst_stride_v,
+                    width, height);
+}
+
+// Convert NV12 to I420. Deprecated.
 int NV12ToI420(const uint8* src_y,
                const uint8* src_uv,
                int src_stride,
@@ -402,12 +420,13 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
     ret
   }
 }
+
 #elif (defined(__x86_64__) || defined(__i386__)) && \
     !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
 #define HAS_SPLITYUY2_SSE2
 static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
                            uint8* dst_u, uint8* dst_v, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "psrlw $0x8,%%xmm7\n"
     "1:"
@@ -435,12 +454,12 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
     "lea 0x8(%3),%3\n"
     "sub $0x10,%4\n"
     "ja 1b\n"
-    :
-    : "r"(src_yuy2),   // %0
-      "r"(dst_y),      // %1
-      "r"(dst_u),      // %2
-      "r"(dst_v),      // %3
-      "r"(pix)         // %4
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_y),     // %1
+      "+r"(dst_u),     // %2
+      "+r"(dst_v),     // %3
+      "+r"(pix)        // %4
+    :
     : "memory"
   );
 }
@@ -469,6 +488,17 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    int halfheight = (height + 1) >> 1;
+    dst_y = dst_y + (height - 1) * dst_stride_y;
+    dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+    dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+    dst_stride_y = -dst_stride_y;
+    dst_stride_u = -dst_stride_u;
+    dst_stride_v = -dst_stride_v;
+  }
   void (*SplitYUY2)(const uint8* src_yuy2,
                     uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
 #if defined(HAS_SPLITYUY2_SSE2)
@@ -642,7 +672,7 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
 #define HAS_YUY2TOI420ROW_SSE2
 static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
                                 uint8* dst_y, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "psrlw $0x8,%%xmm7\n"
     "1:"
@@ -656,24 +686,24 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
     "lea 0x10(%1),%1\n"
     "sub $0x10,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_yuy2),   // %0
-      "r"(dst_y),      // %1
-      "r"(pix)         // %2
+    : "+r"(src_yuy2),  // %0
+      "+r"(dst_y),     // %1
+      "+r"(pix)        // %2
+    :
     : "memory"
   );
 }
 
 static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
                                  uint8* dst_u, uint8* dst_y, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "psrlw $0x8,%%xmm7\n"
     "1:"
     "movdqa (%0),%%xmm0\n"
     "movdqa 0x10(%0),%%xmm1\n"
-    "movdqa (%0,%1,1),%%xmm2\n"
-    "movdqa 0x10(%0,%1,1),%%xmm3\n"
+    "movdqa (%0,%4,1),%%xmm2\n"
+    "movdqa 0x10(%0,%4,1),%%xmm3\n"
     "lea 0x20(%0),%0\n"
     "pavgb %%xmm2,%%xmm0\n"
     "pavgb %%xmm3,%%xmm1\n"
@@ -683,27 +713,26 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
     "movdqa %%xmm0,%%xmm1\n"
     "pand %%xmm7,%%xmm0\n"
     "packuswb %%xmm0,%%xmm0\n"
-    "movq %%xmm0,(%2)\n"
-    "lea 0x8(%2),%2\n"
+    "movq %%xmm0,(%1)\n"
+    "lea 0x8(%1),%1\n"
     "psrlw $0x8,%%xmm1\n"
     "packuswb %%xmm1,%%xmm1\n"
-    "movq %%xmm1,(%3)\n"
-    "lea 0x8(%3),%3\n"
-    "sub $0x10,%4\n"
+    "movq %%xmm1,(%2)\n"
+    "lea 0x8(%2),%2\n"
+    "sub $0x10,%3\n"
     "ja 1b\n"
-    :
-    : "r"(src_yuy2),               // %0
-      "r"((intptr_t)stride_yuy2),  // %1
-      "r"(dst_u),                  // %2
-      "r"(dst_y),                  // %3
-      "r"(pix)                     // %4
+    : "+r"(src_yuy2),                          // %0
+      "+r"(dst_u),                             // %1
+      "+r"(dst_y),                             // %2
+      "+r"(pix)                                // %3
+    : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
     : "memory"
   );
 }
 #define HAS_UYVYTOI420ROW_SSE2
 static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
                                 uint8* dst_y, int pix) {
-  asm(
+  asm volatile(
     "1:"
     "movdqa (%0),%%xmm0\n"
     "movdqa 0x10(%0),%%xmm1\n"
@@ -715,24 +744,24 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
     "lea 0x10(%1),%1\n"
     "sub $0x10,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_uyvy),   // %0
-      "r"(dst_y),      // %1
-      "r"(pix)         // %2
+    : "+r"(src_uyvy),  // %0
+      "+r"(dst_y),     // %1
+      "+r"(pix)        // %2
+    :
     : "memory"
   );
 }
 
 static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                  uint8* dst_u, uint8* dst_y, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "psrlw $0x8,%%xmm7\n"
     "1:"
     "movdqa (%0),%%xmm0\n"
     "movdqa 0x10(%0),%%xmm1\n"
-    "movdqa (%0,%1,1),%%xmm2\n"
-    "movdqa 0x10(%0,%1,1),%%xmm3\n"
+    "movdqa (%0,%4,1),%%xmm2\n"
+    "movdqa 0x10(%0,%4,1),%%xmm3\n"
    "lea 0x20(%0),%0\n"
     "pavgb %%xmm2,%%xmm0\n"
     "pavgb %%xmm3,%%xmm1\n"
@@ -742,28 +771,28 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
     "movdqa %%xmm0,%%xmm1\n"
     "pand %%xmm7,%%xmm0\n"
     "packuswb %%xmm0,%%xmm0\n"
-    "movq %%xmm0,(%2)\n"
-    "lea 0x8(%2),%2\n"
+    "movq %%xmm0,(%1)\n"
+    "lea 0x8(%1),%1\n"
     "psrlw $0x8,%%xmm1\n"
     "packuswb %%xmm1,%%xmm1\n"
-    "movq %%xmm1,(%3)\n"
-    "lea 0x8(%3),%3\n"
-    "sub $0x10,%4\n"
+    "movq %%xmm1,(%2)\n"
+    "lea 0x8(%2),%2\n"
+    "sub $0x10,%3\n"
     "ja 1b\n"
-    :
-    : "r"(src_uyvy),               // %0
-      "r"((intptr_t)stride_uyvy),  // %1
-      "r"(dst_u),                  // %2
-      "r"(dst_y),                  // %3
-      "r"(pix)                     // %4
+    : "+r"(src_uyvy),                          // %0
+      "+r"(dst_u),                             // %1
+      "+r"(dst_y),                             // %2
+      "+r"(pix)                                // %3
+    : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
     : "memory"
   );
 }
 #endif
 
+// Filter 2 rows of YUY2 UV's (422) into U and V (420)
 void YUY2ToI420RowUV_C(const uint8* src_yuy2, int src_stride_yuy2,
                        uint8* dst_u, uint8* dst_v, int pix) {
-  // Copy a row of yuy2 UV values
+  // Output a row of UV values, filtering 2 rows of YUY2
   for (int x = 0; x < pix; x += 2) {
     dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
     dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
@@ -811,6 +840,12 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+    src_stride_yuy2 = -src_stride_yuy2;
+  }
   void (*YUY2ToI420RowUV)(const uint8* src_yuy2, int src_stride_yuy2,
                           uint8* dst_u, uint8* dst_v, int pix);
   void (*YUY2ToI420RowY)(const uint8* src_yuy2,
@@ -852,6 +887,12 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+    src_stride_uyvy = -src_stride_uyvy;
+  }
   void (*UYVYToI420RowUV)(const uint8* src_uyvy, int src_stride_uyvy,
                           uint8* dst_u, uint8* dst_v, int pix);
   void (*UYVYToI420RowY)(const uint8* src_uyvy,
@@ -894,6 +935,12 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -914,6 +961,12 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -933,6 +986,12 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -952,6 +1011,12 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -970,6 +1035,12 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
                const uint8* src_v, int src_stride_v,
                uint8* dst_argb, int dst_stride_argb,
                int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -986,6 +1057,12 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
 int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
                          uint8* dst_argb, int dst_stride_argb,
                          int width, int height) {
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+    dst_stride_argb = -dst_stride_argb;
+  }
   for (int y = 0; y < height; ++y) {
     FastConvertYToRGB32Row(src_y, dst_argb, width);
     dst_argb += dst_stride_argb;
@@ -1157,7 +1234,7 @@ __asm {
 // TODO(yuche): consider moving ARGB related codes to a separate file.
 #define HAS_I400TOARGBROW_SSE2
 static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"
     "pslld $0x18,%%xmm7\n"
     "1:"
@@ -1174,10 +1251,10 @@ static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
     "lea 0x20(%1),%1\n"
     "sub $0x8,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_y),      // %0
-      "r"(dst_argb),   // %1
-      "r"(pix)         // %2
+    : "+r"(src_y),     // %0
+      "+r"(dst_argb),  // %1
+      "+r"(pix)        // %2
+    :
     : "memory"
   );
 }
@@ -1185,7 +1262,7 @@ static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
 #define HAS_ABGRTOARGBROW_SSSE3
 static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
                                 int pix) {
-  asm(
+  asm volatile(
     "movdqa (%3),%%xmm7\n"
     "1:"
     "movdqa (%0),%%xmm0\n"
@@ -1195,11 +1272,10 @@ static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
     "lea 0x10(%1),%1\n"
     "sub $0x4,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_abgr),               // %0
-      "r"(dst_argb),               // %1
-      "r"(pix),                    // %2
-      "r"(kShuffleMaskABGRToARGB)  // %3
+    : "+r"(src_abgr),              // %0
+      "+r"(dst_argb),              // %1
+      "+r"(pix)                    // %2
+    : "r"(kShuffleMaskABGRToARGB)  // %3
     : "memory"
   );
 }
@@ -1207,7 +1283,7 @@ static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
 #define HAS_BGRATOARGBROW_SSSE3
 static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
                                 int pix) {
-  asm(
+  asm volatile(
     "movdqa (%3),%%xmm7\n"
     "1:"
     "movdqa (%0),%%xmm0\n"
@@ -1217,11 +1293,10 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
     "lea 0x10(%1),%1\n"
     "sub $0x4,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_bgra),               // %0
-      "r"(dst_argb),               // %1
-      "r"(pix),                    // %2
-      "r"(kShuffleMaskBGRAToARGB)  // %3
+    : "+r"(src_bgra),              // %0
+      "+r"(dst_argb),              // %1
+      "+r"(pix)                    // %2
+    : "r"(kShuffleMaskBGRAToARGB)  // %3
     : "memory"
   );
 }
@@ -1229,7 +1304,7 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
 #define HAS_BG24TOARGBROW_SSSE3
 static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
                                 int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm7\n"
     "movdqa (%3),%%xmm6\n"
@@ -1257,11 +1332,10 @@ static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
     "lea 0x40(%1),%1\n"
     "sub $0x10,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_bg24),               // %0
-      "r"(dst_argb),               // %1
-      "r"(pix),                    // %2
-      "r"(kShuffleMaskBG24ToARGB)  // %3
+    : "+r"(src_bg24),              // %0
+      "+r"(dst_argb),              // %1
+      "+r"(pix)                    // %2
+    : "r"(kShuffleMaskBG24ToARGB)  // %3
     : "memory"
   );
 }
@@ -1269,7 +1343,7 @@ static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
 #define HAS_RAWTOARGBROW_SSSE3
 static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                                int pix) {
-  asm(
+  asm volatile(
     "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
     "pslld $0x18,%%xmm7\n"
     "movdqa (%3),%%xmm6\n"
@@ -1297,11 +1371,10 @@ static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
     "lea 0x40(%1),%1\n"
     "sub $0x10,%2\n"
     "ja 1b\n"
-    :
-    : "r"(src_raw),               // %0
-      "r"(dst_argb),              // %1
-      "r"(pix),                   // %2
-      "r"(kShuffleMaskRAWToARGB)  // %3
+    : "+r"(src_raw),              // %0
+      "+r"(dst_argb),             // %1
+      "+r"(pix)                   // %2
+    : "r"(kShuffleMaskRAWToARGB)  // %3
     : "memory"
   );
 }
@@ -1530,6 +1603,32 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
   return 0;
 }
 
+static void ARGBToI400Row_C(const uint8* src_argb, uint8* dst_y, int pix) {
+  for (int x = 0; x < pix; ++x) {
+    uint32 b = static_cast<uint32>(src_argb[0] * 25u);
+    uint32 g = static_cast<uint32>(src_argb[1] * 129u);
+    uint32 r = static_cast<uint32>(src_argb[2] * 66u);
+    *(dst_y++) = static_cast<uint8>(((b + g + r) >> 8) + 16u);
+    src_argb += 4;
+  }
+}
+
+// Convert ARGB to I400.
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+               uint8* dst_y, int dst_stride_y,
+               int width, int height) {
+  if (height < 0) {
+    height = -height;
+    src_argb = src_argb + (height - 1) * src_stride_argb;
+    src_stride_argb = -src_stride_argb;
+  }
+  for (int y = 0; y < height; ++y) {
+    ARGBToI400Row_C(src_argb, dst_y, width);
+    src_argb += src_stride_argb;
+    dst_y += dst_stride_y;
+  }
+  return 0;
+}
+
 }  // namespace libyuv
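The new ARGBToI400Row_C above uses the usual fixed-point video-range luma, Y = ((25*B + 129*G + 66*R) >> 8) + 16. A standalone end-point check of that arithmetic (the helper name is illustrative, not part of the commit):

    #include <assert.h>
    #include <stdint.h>

    // Same fixed-point luma as ARGBToI400Row_C in the diff above.
    static uint8_t Luma(uint8_t b, uint8_t g, uint8_t r) {
      return static_cast<uint8_t>(((25u * b + 129u * g + 66u * r) >> 8) + 16u);
    }

    int main() {
      assert(Luma(0, 0, 0) == 16);         // black: video-range floor
      assert(Luma(255, 255, 255) == 235);  // white: (220 * 255 >> 8) + 16
      return 0;
    }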


@@ -21,7 +21,7 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
                               const uint8* v_buf,  // rdx
                               uint8* rgb_buf,      // rcx
                               int width) {         // r8
-  asm(
+  asm volatile(
     "1:"
     "movzb (%1),%%r10\n"
     "lea 1(%1),%1\n"
@@ -44,13 +44,12 @@ void FastConvertYUVToRGB32Row(const uint8* y_buf,  // rdi
     "lea 8(%3),%3\n"
     "sub $0x2,%4\n"
     "ja 1b\n"
-    :
-    : "r"(y_buf),               // %0
-      "r"(u_buf),               // %1
-      "r"(v_buf),               // %2
-      "r"(rgb_buf),             // %3
-      "r"(width),               // %4
-      "r" (_kCoefficientsRgbY)  // %5
+    : "+r"(y_buf),              // %0
+      "+r"(u_buf),              // %1
+      "+r"(v_buf),              // %2
+      "+r"(rgb_buf),            // %3
+      "+r"(width)               // %4
+    : "r" (_kCoefficientsRgbY)  // %5
     : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
@@ -60,7 +59,7 @@ void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
                              const uint8* v_buf,  // rdx
                              uint8* rgb_buf,      // rcx
                              int width) {         // r8
-  asm(
+  asm volatile(
     "1:"
     "movzb (%1),%%r10\n"
     "lea 1(%1),%1\n"
@@ -83,13 +82,12 @@ void FastConvertYUVToBGRARow(const uint8* y_buf,  // rdi
     "lea 8(%3),%3\n"
     "sub $0x2,%4\n"
     "ja 1b\n"
-    :
-    : "r"(y_buf),                // %0
-      "r"(u_buf),                // %1
-      "r"(v_buf),                // %2
-      "r"(rgb_buf),              // %3
-      "r"(width),                // %4
-      "r" (_kCoefficientsBgraY)  // %5
+    : "+r"(y_buf),               // %0
+      "+r"(u_buf),               // %1
+      "+r"(v_buf),               // %2
+      "+r"(rgb_buf),             // %3
+      "+r"(width)                // %4
+    : "r" (_kCoefficientsBgraY)  // %5
     : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
@@ -99,7 +97,7 @@ void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
                             const uint8* v_buf,  // rdx
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
-  asm(
+  asm volatile(
     "1:"
     "movzb (%1),%%r10\n"
     "lea 1(%1),%1\n"
@@ -122,13 +120,12 @@ void FastConvertYUVToABGRRow(const uint8* y_buf,  // rdi
     "lea 8(%3),%3\n"
     "sub $0x2,%4\n"
     "ja 1b\n"
-    :
-    : "r"(y_buf),                // %0
-      "r"(u_buf),                // %1
-      "r"(v_buf),                // %2
-      "r"(rgb_buf),              // %3
-      "r"(width),                // %4
-      "r" (_kCoefficientsAbgrY)  // %5
+    : "+r"(y_buf),               // %0
+      "+r"(u_buf),               // %1
+      "+r"(v_buf),               // %2
+      "+r"(rgb_buf),             // %3
+      "+r"(width)                // %4
+    : "r" (_kCoefficientsAbgrY)  // %5
     : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
@@ -138,7 +135,7 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
                                 const uint8* v_buf,  // rdx
                                 uint8* rgb_buf,      // rcx
                                 int width) {         // r8
-  asm(
+  asm volatile(
     "1:"
     "movzb (%1),%%r10\n"
     "lea 1(%1),%1\n"
@@ -158,13 +155,12 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
     "lea 4(%3),%3\n"
     "sub $0x1,%4\n"
     "ja 1b\n"
-    :
-    : "r"(y_buf),               // %0
-      "r"(u_buf),               // %1
-      "r"(v_buf),               // %2
-      "r"(rgb_buf),             // %3
-      "r"(width),               // %4
-      "r" (_kCoefficientsRgbY)  // %5
+    : "+r"(y_buf),              // %0
+      "+r"(u_buf),              // %1
+      "+r"(v_buf),              // %2
+      "+r"(rgb_buf),            // %3
+      "+r"(width)               // %4
+    : "r" (_kCoefficientsRgbY)  // %5
     : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2"
   );
 }
@@ -172,7 +168,7 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,  // rdi
 void FastConvertYToRGB32Row(const uint8* y_buf,  // rdi
                             uint8* rgb_buf,      // rcx
                             int width) {         // r8
-  asm(
+  asm volatile(
     "1:"
     "movzb (%0),%%r10\n"
     "movzb 0x1(%0),%%r11\n"
@@ -186,11 +182,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf,  // rdi
     "lea 8(%1),%1\n"
     "sub $0x2,%2\n"
     "ja 1b\n"
-    :
-    : "r"(y_buf),               // %0
-      "r"(rgb_buf),             // %1
-      "r"(width),               // %2
-      "r" (_kCoefficientsRgbY)  // %3
+    : "+r"(y_buf),              // %0
+      "+r"(rgb_buf),            // %1
+      "+r"(width)               // %2
+    : "r" (_kCoefficientsRgbY)  // %3
     : "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
   );
 }
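Both files apply the same inline-asm fix: asm volatile so the optimizer cannot drop or reorder the block, "+r" (read-write) constraints on every pointer and counter the asm body advances with lea/sub, and strides widened to intptr_t so the index register is pointer-width on x64. Because no register is hardcoded, the allocator can also keep ebx free for 32-bit PIC builds. A self-contained hypothetical row helper showing the pattern (not from the commit; assumes 16-byte-aligned pointers and width a multiple of 16):

    #include <stdint.h>

    static void AverageRows_SSE2(const uint8_t* src, intptr_t stride,
                                 uint8_t* dst, int width) {
      asm volatile(
        "1:"
        "movdqa (%0),%%xmm0\n"      // load 16 pixels of row 0
        "pavgb (%0,%3,1),%%xmm0\n"  // average with row 1, one stride away
        "lea 0x10(%0),%0\n"         // the asm advances %0, so it is "+r"
        "movdqa %%xmm0,(%1)\n"
        "lea 0x10(%1),%1\n"
        "sub $0x10,%2\n"
        "ja 1b\n"
        : "+r"(src),    // %0: modified by the loop, declared read-write
          "+r"(dst),    // %1
          "+r"(width)   // %2
        : "r"(stride)   // %3: read-only input, already pointer-width
        : "memory", "xmm0"
      );
    }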

File diff suppressed because it is too large.