mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-12 06:29:57 +08:00
ARGB To I420 and variations using row functions
BUG=none TEST=media_unittests from talk used to benchmark Review URL: http://webrtc-codereview.appspot.com/254001 git-svn-id: http://libyuv.googlecode.com/svn/trunk@51 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
7472021e53
commit
9394ed99fc
@ -20,6 +20,9 @@ static const int kCpuHasSSSE3 = 2;
|
||||
// These flags are only valid on ARM processors
|
||||
static const int kCpuHasNEON = 4;
|
||||
|
||||
// Internal flag to indicate cpuid is initialized.
|
||||
static const int kCpuInitialized = 8;
|
||||
|
||||
// Detect CPU has SSE2 etc.
|
||||
bool TestCpuFlag(int flag);
|
||||
|
||||
|
||||
@ -636,185 +636,6 @@ int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// ARGBToI420Row_C etc row functions use the following macro, generating
// code with RGB offsets/strides different for each version. Less error
// prone than duplicating the code.
// template could be used, but macro method works for C and asm and this is
// performance critical code.
//
// Each generated function converts two source rows (src_row0/src_row1) of
// packed RGB into two luma rows plus one half-width row of U and V:
//   Y = ((66*R + 129*G + 25*B + 128) >> 8) + 16        (8-bit fixed point)
//   U = (((-38*R - 74*G + 112*B sums of 4 px) + 512) >> 10) + 128
//   V = (((112*R - 94*G - 18*B sums of 4 px) + 512) >> 10) + 128
// Chroma averages a 2x2 block; the odd trailing column (width & 1) averages
// only the 2x1 column, hence the ">> 9" with a +256 rounding term there.
// The "+ \ + 512" sequences are a stray unary plus — harmless, kept as-is.
// R/G/B are byte offsets of each channel within a pixel; BPP is bytes/pixel.
#define MAKEROWRGBTOI420(NAME,R,G,B,BPP) \
static void \
NAME(const uint8* src_row0, const uint8* src_row1, \
     uint8* dst_yplane0, uint8* dst_yplane1, \
     uint8* dst_u, \
     uint8* dst_v, \
     int width) { \
  for (int x = 0; x < width - 1; x += 2) { \
    dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
                              src_row0[G] * 129 + \
                              src_row0[B] * 25 + 128) >> 8) + 16; \
    dst_yplane0[1] = (uint8)((src_row0[R + BPP] * 66 + \
                              src_row0[G + BPP] * 129 + \
                              src_row0[B + BPP] * 25 + 128) >> 8) + 16; \
    dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
                              src_row1[G] * 129 + \
                              src_row1[B] * 25 + 128) >> 8) + 16; \
    dst_yplane1[1] = (uint8)((src_row1[R + BPP] * 66 + \
                              src_row1[G + BPP] * 129 + \
                              src_row1[B + BPP] * 25 + 128) >> 8) + 16; \
    dst_u[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
                         src_row1[R] + src_row1[R + BPP]) * -38 + \
                        (src_row0[G] + src_row0[G + BPP] + \
                         src_row1[G] + src_row1[G + BPP]) * -74 + \
                        (src_row0[B] + src_row0[B + BPP] + \
                         src_row1[B] + src_row1[B + BPP]) * 112 + \
                        + 512) >> 10) + 128; \
    dst_v[0] = (uint8)(((src_row0[R] + src_row0[R + BPP] + \
                         src_row1[R] + src_row1[R + BPP]) * 112 + \
                        (src_row0[G] + src_row0[G + BPP] + \
                         src_row1[G] + src_row1[G + BPP]) * -94 + \
                        (src_row0[B] + src_row0[B + BPP] + \
                         src_row1[B] + src_row1[B + BPP]) * -18 + \
                        + 512) >> 10) + 128; \
    dst_yplane0 += 2; \
    dst_yplane1 += 2; \
    ++dst_u; \
    ++dst_v; \
    src_row0 += BPP * 2; \
    src_row1 += BPP * 2; \
  } \
  if (width & 1) { \
    dst_yplane0[0] = (uint8)((src_row0[R] * 66 + \
                              src_row0[G] * 129 + \
                              src_row0[B] * 25 + 128) >> 8) + 16; \
    dst_yplane1[0] = (uint8)((src_row1[R] * 66 + \
                              src_row1[G] * 129 + \
                              src_row1[B] * 25 + 128) >> 8) + 16; \
    dst_u[0] = (uint8)(((src_row0[R] + \
                         src_row1[R]) * -38 + \
                        (src_row0[G] + \
                         src_row1[G]) * -74 + \
                        (src_row0[B] + \
                         src_row1[B]) * 112 + \
                        + 256) >> 9) + 128; \
    dst_v[0] = (uint8)(((src_row0[R] + \
                         src_row1[R]) * 112 + \
                        (src_row0[G] + \
                         src_row1[G]) * -94 + \
                        (src_row0[B] + \
                         src_row1[B]) * -18 + \
                        + 256) >> 9) + 128; \
  } \
}

// Generate variations of RGBToI420. Parameters are r,g,b offsets within a
// pixel, and number of bytes per pixel.
MAKEROWRGBTOI420(ARGBToI420Row_C, 2, 1, 0, 4)
MAKEROWRGBTOI420(BGRAToI420Row_C, 1, 2, 3, 4)
MAKEROWRGBTOI420(ABGRToI420Row_C, 0, 1, 2, 4)
MAKEROWRGBTOI420(RGB24ToI420Row_C, 2, 1, 0, 3)
MAKEROWRGBTOI420(RAWToI420Row_C, 0, 1, 2, 3)
|
||||
|
||||
static int RGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height,
|
||||
void (*RGBToI420Row)(const uint8* src_row0,
|
||||
const uint8* src_row1,
|
||||
uint8* dst_yplane0,
|
||||
uint8* dst_yplane1,
|
||||
uint8* dst_u,
|
||||
uint8* dst_v,
|
||||
int width)) {
|
||||
if (src_frame == NULL || dst_y == NULL ||
|
||||
dst_v == NULL || dst_v == NULL)
|
||||
return -1;
|
||||
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_frame = src_frame + src_stride_frame * (height -1);
|
||||
src_stride_frame = -src_stride_frame;
|
||||
}
|
||||
for (int y = 0; y < height - 1; y += 2) {
|
||||
RGBToI420Row(src_frame, src_frame + src_stride_frame,
|
||||
dst_y, dst_y + dst_stride_y,
|
||||
dst_u, dst_v,
|
||||
width);
|
||||
src_frame += src_stride_frame * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_u += dst_stride_u;
|
||||
dst_v += dst_stride_v;
|
||||
}
|
||||
if (height & 1) {
|
||||
RGBToI420Row(src_frame, src_frame,
|
||||
dst_y, dst_y,
|
||||
dst_u, dst_v,
|
||||
width);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
int ARGBToI420_Reference(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
return RGBToI420(src_frame, src_stride_frame,
|
||||
dst_y, dst_stride_y,
|
||||
dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v,
|
||||
width, height, ARGBToI420Row_C);
|
||||
}
|
||||
|
||||
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
return RGBToI420(src_frame, src_stride_frame,
|
||||
dst_y, dst_stride_y,
|
||||
dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v,
|
||||
width, height, BGRAToI420Row_C);
|
||||
}
|
||||
|
||||
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
return RGBToI420(src_frame, src_stride_frame,
|
||||
dst_y, dst_stride_y,
|
||||
dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v,
|
||||
width, height, ABGRToI420Row_C);
|
||||
}
|
||||
|
||||
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
return RGBToI420(src_frame, src_stride_frame,
|
||||
dst_y, dst_stride_y,
|
||||
dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v,
|
||||
width, height, RGB24ToI420Row_C);
|
||||
}
|
||||
|
||||
int RAWToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
return RGBToI420(src_frame, src_stride_frame,
|
||||
dst_y, dst_stride_y,
|
||||
dst_u, dst_stride_u,
|
||||
dst_v, dst_stride_v,
|
||||
width, height, RAWToI420Row_C);
|
||||
}
|
||||
|
||||
int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_y, int dst_stride_y,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
@ -830,9 +651,9 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 8 == 0) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
|
||||
IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
|
||||
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
} else
|
||||
#endif
|
||||
@ -841,10 +662,10 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
}
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 8 == 0) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
|
||||
IS_ALIGNED(dst_u, 4) && (dst_stride_u % 4 == 0) &&
|
||||
IS_ALIGNED(dst_v, 4) && (dst_stride_v % 4 == 0)) {
|
||||
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
|
||||
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
} else
|
||||
#endif
|
||||
@ -853,17 +674,229 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
|
||||
}
|
||||
|
||||
for (int y = 0; y < (height - 1); y += 2) {
|
||||
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
|
||||
ARGBToYRow(src_frame, dst_y, width);
|
||||
ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
|
||||
ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
|
||||
src_frame += src_stride_frame * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_u += dst_stride_u;
|
||||
dst_v += dst_stride_v;
|
||||
}
|
||||
if (height & 1) {
|
||||
ARGBToYRow(src_frame, dst_y, width);
|
||||
ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
|
||||
ARGBToYRow(src_frame, dst_y, width);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert BGRA to I420.  A negative height inverts the image vertically.
// The SSSE3 row functions are selected only when width is a multiple of 16
// and the source / Y pointers and strides are 16-byte aligned (8-byte for
// U/V in the UV path); otherwise the C row functions are used.  Returns 0.
int BGRAToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  // NOTE(review): locals are named ARGBToYRow/ARGBToUVRow but hold BGRA row
  // functions here — consider renaming for clarity.
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = BGRAToYRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    ARGBToYRow = BGRAToYRow_C;
  }
#if defined(HAS_BGRATOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = BGRAToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = BGRAToUVRow_C;
  }

  // Two rows per iteration: UV is 2x2 subsampled, so one UV row per pair.
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // Odd trailing row: UV stride 0 makes the row function read the same
    // row twice for its vertical average.
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
|
||||
|
||||
// Convert ABGR to I420.  A negative height inverts the image vertically.
// SSSE3 row functions are used only when width % 16 == 0 and alignment/
// stride requirements hold (16 bytes for src and Y, 8 for U/V); otherwise
// the C row functions are used.  Returns 0.
int ABGRToI420(const uint8* src_frame, int src_stride_frame,
               uint8* dst_y, int dst_stride_y,
               uint8* dst_u, int dst_stride_u,
               uint8* dst_v, int dst_stride_v,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  // NOTE(review): locals are named ARGBToYRow/ARGBToUVRow but hold ABGR
  // row functions here.
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ABGRToYRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    ARGBToYRow = ABGRToYRow_C;
  }
#if defined(HAS_ABGRTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = ABGRToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = ABGRToUVRow_C;
  }

  // Two rows per iteration: UV is 2x2 subsampled, so one UV row per pair.
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // Odd trailing row: UV stride 0 averages the row with itself.
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
|
||||
|
||||
// Convert RGB24 (3 bytes per pixel) to I420.  A negative height inverts
// the image vertically.  SSSE3 row functions are used only when
// width % 16 == 0 and the alignment/stride requirements hold; otherwise
// the C row functions are used.  Returns 0.
int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
                uint8* dst_y, int dst_stride_y,
                uint8* dst_u, int dst_stride_u,
                uint8* dst_v, int dst_stride_v,
                int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  // NOTE(review): locals are named ARGBToYRow/ARGBToUVRow but hold RGB24
  // row functions here.
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = RGB24ToYRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    ARGBToYRow = RGB24ToYRow_C;
  }
#if defined(HAS_RGB24TOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = RGB24ToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = RGB24ToUVRow_C;
  }

  // Two rows per iteration: UV is 2x2 subsampled, so one UV row per pair.
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // Odd trailing row: UV stride 0 averages the row with itself.
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
|
||||
|
||||
// Convert RAW (3 bytes per pixel, reversed channel order) to I420.
// A negative height inverts the image vertically.  SSSE3 row functions
// are used only when width % 16 == 0 and the alignment/stride requirements
// hold; otherwise the C row functions are used.  Returns 0.
int RAWToI420(const uint8* src_frame, int src_stride_frame,
              uint8* dst_y, int dst_stride_y,
              uint8* dst_u, int dst_stride_u,
              uint8* dst_v, int dst_stride_v,
              int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_frame = src_frame + (height - 1) * src_stride_frame;
    src_stride_frame = -src_stride_frame;
  }
  // NOTE(review): locals are named ARGBToYRow/ARGBToUVRow but hold RAW
  // row functions here.
  void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
  void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
                      uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOYROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = RAWToYRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    ARGBToYRow = RAWToYRow_C;
  }
#if defined(HAS_RAWTOUVROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
      IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
    ARGBToUVRow = RAWToUVRow_SSSE3;
  } else
#endif
  {
    ARGBToUVRow = RAWToUVRow_C;
  }

  // Two rows per iteration: UV is 2x2 subsampled, so one UV row per pair.
  for (int y = 0; y < (height - 1); y += 2) {
    ARGBToUVRow(src_frame, src_stride_frame, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
    ARGBToYRow(src_frame + src_stride_frame, dst_y + dst_stride_y, width);
    src_frame += src_stride_frame * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
  if (height & 1) {
    // Odd trailing row: UV stride 0 averages the row with itself.
    ARGBToUVRow(src_frame, 0, dst_u, dst_v, width);
    ARGBToYRow(src_frame, dst_y, width);
  }
  return 0;
}
|
||||
|
||||
@ -15,9 +15,6 @@
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
// Internal flag to indicate cpuid is initialized.
|
||||
static const int kCpuInitialized = 16;
|
||||
|
||||
// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
|
||||
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
|
||||
static inline void __cpuid(int cpu_info[4], int info_type) {
|
||||
@ -64,11 +61,11 @@ static void InitCpuFlags() {
|
||||
|
||||
void MaskCpuFlags(int enable_flags) {
|
||||
InitCpuFlags();
|
||||
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
|
||||
cpu_info_ &= enable_flags;
|
||||
}
|
||||
|
||||
bool TestCpuFlag(int flag) {
|
||||
if (!cpu_info_) {
|
||||
if (0 == cpu_info_) {
|
||||
InitCpuFlags();
|
||||
}
|
||||
return cpu_info_ & flag ? true : false;
|
||||
|
||||
@ -14,6 +14,8 @@
|
||||
#include "video_common.h"
|
||||
#include "row.h"
|
||||
|
||||
#define kMaxStride (2048 * 4)
|
||||
|
||||
namespace libyuv {
|
||||
|
||||
// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
|
||||
@ -329,6 +331,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
|
||||
uint8* dst_u, int dst_stride_u,
|
||||
uint8* dst_v, int dst_stride_v,
|
||||
int width, int height) {
|
||||
if (width * 4 > kMaxStride) {
|
||||
return -1;
|
||||
}
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
@ -347,23 +352,29 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
|
||||
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#define kMaxStride (2048 * 4)
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
|
||||
#if defined(HAS_ARGBTOYROW_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 8 == 0) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
|
||||
IS_ALIGNED(dst_y, 8) && (dst_stride_y % 8 == 0)) {
|
||||
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
|
||||
ARGBToYRow = ARGBToYRow_SSSE3;
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
#else
|
||||
ARGBToUVRow = ARGBToUVRow_C;
|
||||
#endif
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
ARGBToYRow = ARGBToYRow_C;
|
||||
}
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(row, 16) && (kMaxStride % 16 == 0) &&
|
||||
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
|
||||
IS_ALIGNED(dst_v, 8) && (dst_stride_v % 8 == 0)) {
|
||||
ARGBToUVRow = ARGBToUVRow_SSSE3;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
ARGBToUVRow = ARGBToUVRow_C;
|
||||
}
|
||||
|
||||
@ -392,9 +403,9 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
|
||||
BayerRow0(src_bayer, src_stride_bayer, row, width);
|
||||
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
|
||||
row + kMaxStride, width);
|
||||
ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
|
||||
ARGBToYRow(row, dst_y, width);
|
||||
ARGBToYRow(row + kMaxStride, dst_y + dst_stride_y, width);
|
||||
ARGBToUVRow(row, kMaxStride, dst_u, dst_v, width);
|
||||
src_bayer += src_stride_bayer * 2;
|
||||
dst_y += dst_stride_y * 2;
|
||||
dst_u += dst_stride_u;
|
||||
@ -403,8 +414,8 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
|
||||
// TODO(fbarchard): Make sure this filters properly
|
||||
if (height & 1) {
|
||||
BayerRow0(src_bayer, src_stride_bayer, row, width);
|
||||
ARGBToYRow(row, dst_y, width);
|
||||
ARGBToUVRow(row, 0, dst_u, dst_v, width);
|
||||
ARGBToYRow(row, dst_y, width);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@ -58,16 +58,6 @@ extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
|
||||
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
|
||||
};
|
||||
|
||||
// Shuffle table for converting BG24 to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
|
||||
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting RAW to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
|
||||
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
||||
};
|
||||
|
||||
#if defined(WIN32) && !defined(COVERAGE_ENABLED)
|
||||
#define HAS_SPLITUV_SSE2
|
||||
__declspec(naked)
|
||||
@ -206,7 +196,7 @@ int I420Copy(const uint8* src_y, int src_stride_y,
|
||||
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
|
||||
__asm__ volatile
|
||||
(
|
||||
"vdup.u32 {q0}, %2 \n" // duplicate 4 ints
|
||||
"vdup.u32 q0, %2 \n" // duplicate 4 ints
|
||||
"1:\n"
|
||||
"vst1.u32 {q0}, [%0]! \n" // store
|
||||
"subs %1, %1, #16 \n" // 16 processed per loop
|
||||
@ -1282,85 +1272,6 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
#define HAS_BG24TOARGBROW_SSSE3
// MSVC inline-asm SSSE3 row function: expands 16 BG24 (3 bytes/pixel)
// pixels per iteration (48 source bytes) into 16 ARGB pixels (64 bytes),
// using pshufb with _kShuffleMaskBG24ToARGB to place the channels and
// OR-ing in the 0xff000000 alpha mask built in xmm7.
// Requires pix to be a multiple of 16 and 16-byte-aligned src/dst
// (movdqa) — enforced by the caller's dispatch checks.
__declspec(naked)
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
                                int pix) {
__asm {
    mov eax, [esp + 4] // src_bg24
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm7, xmm7 // generate mask 0xff000000
    pslld xmm7, 24
    movdqa xmm6, _kShuffleMaskBG24ToARGB

 convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm6
    por xmm2, xmm7
    palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm6
    movdqa [edx + 32], xmm2
    por xmm0, xmm7
    pshufb xmm1, xmm6
    movdqa [edx], xmm0
    por xmm1, xmm7
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm6
    movdqa [edx + 16], xmm1
    por xmm3, xmm7
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
  }
}
|
||||
|
||||
#define HAS_RAWTOARGBROW_SSSE3
// MSVC inline-asm SSSE3 row function: expands 16 RAW (3 bytes/pixel)
// pixels per iteration into 16 ARGB pixels.  Identical structure to
// BG24ToARGBRow_SSSE3 except the shuffle mask (_kShuffleMaskRAWToARGB)
// also swaps the channel order.  Requires pix % 16 == 0 and 16-byte
// aligned src/dst (movdqa).
__declspec(naked)
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                               int pix) {
__asm {
    mov eax, [esp + 4] // src_raw
    mov edx, [esp + 8] // dst_argb
    mov ecx, [esp + 12] // pix
    pcmpeqb xmm7, xmm7 // generate mask 0xff000000
    pslld xmm7, 24
    movdqa xmm6, _kShuffleMaskRAWToARGB

 convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm6
    por xmm2, xmm7
    palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm6
    movdqa [edx + 32], xmm2
    por xmm0, xmm7
    pshufb xmm1, xmm6
    movdqa [edx], xmm0
    por xmm1, xmm7
    palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm6
    movdqa [edx + 16], xmm1
    por xmm3, xmm7
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
  }
}
|
||||
|
||||
#elif (defined(__x86_64__) || defined(__i386__)) && \
|
||||
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
|
||||
@ -1435,84 +1346,6 @@ static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
|
||||
);
|
||||
}
|
||||
|
||||
#define HAS_BG24TOARGBROW_SSSE3
// GCC inline-asm SSSE3 row function: expands 16 BG24 (3 bytes/pixel)
// pixels per iteration (48 source bytes) into 16 ARGB pixels (64 bytes).
// Uses pshufb with kShuffleMaskBG24ToARGB to place channels and ORs in
// the 0xff000000 alpha mask built in xmm7.  Requires pix % 16 == 0 and
// 16-byte aligned src/dst (movdqa) — guaranteed by the dispatch checks.
static void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb,
                                int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_bg24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskBG24ToARGB)  // %3
  : "memory"
);
}
|
||||
|
||||
#define HAS_RAWTOARGBROW_SSSE3
// GCC inline-asm SSSE3 row function: expands 16 RAW (3 bytes/pixel)
// pixels per iteration into 16 ARGB pixels.  Identical structure to
// BG24ToARGBRow_SSSE3 except the shuffle mask (kShuffleMaskRAWToARGB)
// also swaps channel order.  Requires pix % 16 == 0 and 16-byte aligned
// src/dst (movdqa).
static void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                               int pix) {
  asm volatile(
  "pcmpeqb %%xmm7,%%xmm7\n"  // generate mask 0xff000000
  "pslld $0x18,%%xmm7\n"
  "movdqa (%3),%%xmm6\n"
"1:"
  "movdqa (%0),%%xmm0\n"
  "movdqa 0x10(%0),%%xmm1\n"
  "movdqa 0x20(%0),%%xmm3\n"
  "lea 0x30(%0),%0\n"
  "movdqa %%xmm3,%%xmm2\n"
  "palignr $0x8,%%xmm1,%%xmm2\n"  // xmm2 = { xmm3[0:3] xmm1[8:15] }
  "pshufb %%xmm6,%%xmm2\n"
  "por %%xmm7,%%xmm2\n"
  "palignr $0xc,%%xmm0,%%xmm1\n"  // xmm1 = { xmm3[0:7] xmm0[12:15] }
  "pshufb %%xmm6,%%xmm0\n"
  "movdqa %%xmm2,0x20(%1)\n"
  "por %%xmm7,%%xmm0\n"
  "pshufb %%xmm6,%%xmm1\n"
  "movdqa %%xmm0,(%1)\n"
  "por %%xmm7,%%xmm1\n"
  "palignr $0x4,%%xmm3,%%xmm3\n"  // xmm3 = { xmm3[4:15] }
  "pshufb %%xmm6,%%xmm3\n"
  "movdqa %%xmm1,0x10(%1)\n"
  "por %%xmm7,%%xmm3\n"
  "movdqa %%xmm3,0x30(%1)\n"
  "lea 0x40(%1),%1\n"
  "sub $0x10,%2\n"
  "ja 1b\n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(kShuffleMaskRAWToARGB)  // %3
  : "memory"
);
}
|
||||
|
||||
#endif
|
||||
|
||||
static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
|
||||
@ -1556,97 +1389,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
static void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
for (int x = 0; x < pix; ++x) {
|
||||
uint8 r = src_raw[0];
|
||||
uint8 g = src_raw[1];
|
||||
uint8 b = src_raw[2];
|
||||
dst_argb[0] = b;
|
||||
dst_argb[1] = g;
|
||||
dst_argb[2] = r;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb += 4;
|
||||
src_raw += 3;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert RAW to ARGB.
|
||||
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
|
||||
uint8* dst_argb, int dst_stride_argb,
|
||||
int width, int height) {
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_raw = src_raw + (height - 1) * src_stride_raw;
|
||||
src_stride_raw = -src_stride_raw;
|
||||
}
|
||||
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
|
||||
#if defined(HAS_RAWTOARGBROW_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
|
||||
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
|
||||
RAWToARGBRow = RAWToARGBRow_SSSE3;
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
RAWToARGBRow = RAWToARGBRow_C;
|
||||
}
|
||||
|
||||
for (int y = 0; y < height; ++y) {
|
||||
RAWToARGBRow(src_raw, dst_argb, width);
|
||||
src_raw += src_stride_raw;
|
||||
dst_argb += dst_stride_argb;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
for (int x = 0; x < pix; ++x) {
|
||||
uint8 b = src_bg24[0];
|
||||
uint8 g = src_bg24[1];
|
||||
uint8 r = src_bg24[2];
|
||||
dst_argb[0] = b;
|
||||
dst_argb[1] = g;
|
||||
dst_argb[2] = r;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb += 4;
|
||||
src_bg24 += 3;
|
||||
}
|
||||
}
|
||||
|
||||
// Convert BG24 to ARGB.
// A negative height inverts the image vertically.  The SSSE3 row function
// is used only when width % 16 == 0 and both pointers/strides are 16-byte
// aligned; otherwise the C row function is used.  Returns 0.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  // Negative height means invert the image.
  if (height < 0) {
    height = -height;
    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
    src_stride_bg24 = -src_stride_bg24;
  }
  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    BG24ToARGBRow = BG24ToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    BG24ToARGBRow(src_bg24, dst_argb, width);
    src_bg24 += src_stride_bg24;
    dst_argb += dst_stride_argb;
  }
  return 0;
}
|
||||
|
||||
|
||||
static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
|
||||
for (int x = 0; x < pix; ++x) {
|
||||
// To support in-place conversion.
|
||||
@ -1768,5 +1510,66 @@ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
||||
// Convert RAW to ARGB.
// NOTE(review): this function also appears earlier in the scraped diff —
// this is the retained copy at the end of the file.
// A negative height inverts the image vertically.  The SSSE3 row function
// is used only when width % 16 == 0 and both pointers/strides are 16-byte
// aligned; otherwise the C row function is used.  Returns 0.
int RAWToARGB(const uint8* src_raw, int src_stride_raw,
              uint8* dst_argb, int dst_stride_argb,
              int width, int height) {
  if (height < 0) {
    height = -height;
    src_raw = src_raw + (height - 1) * src_stride_raw;
    src_stride_raw = -src_stride_raw;
  }
  void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    RAWToARGBRow = RAWToARGBRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    RAWToARGBRow = RAWToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    RAWToARGBRow(src_raw, dst_argb, width);
    src_raw += src_stride_raw;
    dst_argb += dst_stride_argb;
  }
  return 0;
}
|
||||
|
||||
// Convert BG24 to ARGB.
// NOTE(review): this function also appears earlier in the scraped diff —
// this is the retained copy at the end of the file.
// A negative height inverts the image vertically.  The SSSE3 row function
// is used only when width % 16 == 0 and both pointers/strides are 16-byte
// aligned; otherwise the C row function is used.  Returns 0.
int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
               uint8* dst_argb, int dst_stride_argb,
               int width, int height) {
  if (height < 0) {
    height = -height;
    src_bg24 = src_bg24 + (height - 1) * src_stride_bg24;
    src_stride_bg24 = -src_stride_bg24;
  }
  void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
  if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
      IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
    BG24ToARGBRow = BG24ToARGBRow_SSSE3;
  } else  // dangling else deliberately pairs with the block below
#endif
  {
    BG24ToARGBRow = BG24ToARGBRow_C;
  }

  for (int y = 0; y < height; ++y) {
    BG24ToARGBRow(src_bg24, dst_argb, width);
    src_bg24 += src_stride_bg24;
    dst_argb += dst_stride_argb;
  }
  return 0;
}
|
||||
|
||||
} // namespace libyuv
|
||||
|
||||
|
||||
154
source/rotate.cc
154
source/rotate.cc
@ -497,6 +497,143 @@ extern "C" void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
);
|
||||
|
||||
#if defined (__x86_64__)
|
||||
// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
|
||||
#define HAS_TRANSPOSE_WX8_FAST_SSSE3
|
||||
static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
|
||||
uint8* dst, int dst_stride, int width) {
|
||||
asm volatile(
|
||||
"1:"
|
||||
// Read in the data from the source pointer.
|
||||
// First round of bit swap.
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa (%0,%3),%%xmm1\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm0,%%xmm8\n"
|
||||
"punpcklbw %%xmm1,%%xmm0\n"
|
||||
"punpckhbw %%xmm1,%%xmm8\n"
|
||||
"movdqa (%0),%%xmm2\n"
|
||||
"movdqa %%xmm0,%%xmm1\n"
|
||||
"movdqa %%xmm8,%%xmm9\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm1\n"
|
||||
"palignr $0x8,%%xmm9,%%xmm9\n"
|
||||
"movdqa (%0,%3),%%xmm3\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm2,%%xmm10\n"
|
||||
"punpcklbw %%xmm3,%%xmm2\n"
|
||||
"punpckhbw %%xmm3,%%xmm10\n"
|
||||
"movdqa %%xmm2,%%xmm3\n"
|
||||
"movdqa %%xmm10,%%xmm11\n"
|
||||
"movdqa (%0),%%xmm4\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11\n"
|
||||
"movdqa (%0,%3),%%xmm5\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm4,%%xmm12\n"
|
||||
"punpcklbw %%xmm5,%%xmm4\n"
|
||||
"punpckhbw %%xmm5,%%xmm12\n"
|
||||
"movdqa %%xmm4,%%xmm5\n"
|
||||
"movdqa %%xmm12,%%xmm13\n"
|
||||
"movdqa (%0),%%xmm6\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13\n"
|
||||
"movdqa (%0,%3),%%xmm7\n"
|
||||
"lea (%0,%3,2),%0\n"
|
||||
"movdqa %%xmm6,%%xmm14\n"
|
||||
"punpcklbw %%xmm7,%%xmm6\n"
|
||||
"punpckhbw %%xmm7,%%xmm14\n"
|
||||
"neg %3\n"
|
||||
"movdqa %%xmm6,%%xmm7\n"
|
||||
"movdqa %%xmm14,%%xmm15\n"
|
||||
"lea 0x10(%0,%3,8),%0\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
"neg %3\n"
|
||||
// Second round of bit swap.
|
||||
"punpcklwd %%xmm2,%%xmm0\n"
|
||||
"punpcklwd %%xmm3,%%xmm1\n"
|
||||
"movdqa %%xmm0,%%xmm2\n"
|
||||
"movdqa %%xmm1,%%xmm3\n"
|
||||
"palignr $0x8,%%xmm2,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm3,%%xmm3\n"
|
||||
"punpcklwd %%xmm6,%%xmm4\n"
|
||||
"punpcklwd %%xmm7,%%xmm5\n"
|
||||
"movdqa %%xmm4,%%xmm6\n"
|
||||
"movdqa %%xmm5,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"punpcklwd %%xmm10,%%xmm8\n"
|
||||
"punpcklwd %%xmm11,%%xmm9\n"
|
||||
"movdqa %%xmm8,%%xmm10\n"
|
||||
"movdqa %%xmm9,%%xmm11\n"
|
||||
"palignr $0x8,%%xmm10,%%xmm10\n"
|
||||
"palignr $0x8,%%xmm11,%%xmm11\n"
|
||||
"punpcklwd %%xmm14,%%xmm12\n"
|
||||
"punpcklwd %%xmm15,%%xmm13\n"
|
||||
"movdqa %%xmm12,%%xmm14\n"
|
||||
"movdqa %%xmm13,%%xmm15\n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
// Third round of bit swap.
|
||||
// Write to the destination pointer.
|
||||
"punpckldq %%xmm4,%%xmm0\n"
|
||||
"movq %%xmm0,(%1)\n"
|
||||
"movdqa %%xmm0,%%xmm4\n"
|
||||
"palignr $0x8,%%xmm4,%%xmm4\n"
|
||||
"movq %%xmm4,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm6,%%xmm2\n"
|
||||
"movdqa %%xmm2,%%xmm6\n"
|
||||
"movq %%xmm2,(%1)\n"
|
||||
"palignr $0x8,%%xmm6,%%xmm6\n"
|
||||
"punpckldq %%xmm5,%%xmm1\n"
|
||||
"movq %%xmm6,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"movdqa %%xmm1,%%xmm5\n"
|
||||
"movq %%xmm1,(%1)\n"
|
||||
"palignr $0x8,%%xmm5,%%xmm5\n"
|
||||
"movq %%xmm5,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm7,%%xmm3\n"
|
||||
"movq %%xmm3,(%1)\n"
|
||||
"movdqa %%xmm3,%%xmm7\n"
|
||||
"palignr $0x8,%%xmm7,%%xmm7\n"
|
||||
"movq %%xmm7,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm12,%%xmm8\n"
|
||||
"movq %%xmm8,(%1)\n"
|
||||
"movdqa %%xmm8,%%xmm12\n"
|
||||
"palignr $0x8,%%xmm12,%%xmm12\n"
|
||||
"movq %%xmm12,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm14,%%xmm10\n"
|
||||
"movdqa %%xmm10,%%xmm14\n"
|
||||
"movq %%xmm10,(%1)\n"
|
||||
"palignr $0x8,%%xmm14,%%xmm14\n"
|
||||
"punpckldq %%xmm13,%%xmm9\n"
|
||||
"movq %%xmm14,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"movdqa %%xmm9,%%xmm13\n"
|
||||
"movq %%xmm9,(%1)\n"
|
||||
"palignr $0x8,%%xmm13,%%xmm13\n"
|
||||
"movq %%xmm13,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"punpckldq %%xmm15,%%xmm11\n"
|
||||
"movq %%xmm11,(%1)\n"
|
||||
"movdqa %%xmm11,%%xmm15\n"
|
||||
"palignr $0x8,%%xmm15,%%xmm15\n"
|
||||
"movq %%xmm15,(%1,%4)\n"
|
||||
"lea (%1,%4,2),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(static_cast<intptr_t>(src_stride)), // %3
|
||||
"r"(static_cast<intptr_t>(dst_stride)) // %4
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
#define HAS_TRANSPOSE_UVWX8_SSE2
|
||||
static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
|
||||
uint8* dst_a, int dst_stride_a,
|
||||
@ -644,17 +781,26 @@ void TransposePlane(const uint8* src, int src_stride,
|
||||
#if defined(HAS_TRANSPOSE_WX8_NEON)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
|
||||
(width % 8 == 0) &&
|
||||
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
|
||||
IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
|
||||
IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
|
||||
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
|
||||
TransposeWx8 = TransposeWx8_NEON;
|
||||
TransposeWxH = TransposeWxH_C;
|
||||
} else
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 16 == 0) &&
|
||||
IS_ALIGNED(src, 16) && (src_stride % 16 == 0) &&
|
||||
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
|
||||
TransposeWx8 = TransposeWx8_FAST_SSSE3;
|
||||
TransposeWxH = TransposeWxH_C;
|
||||
} else
|
||||
#endif
|
||||
#if defined(HAS_TRANSPOSE_WX8_SSSE3)
|
||||
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
|
||||
(width % 8 == 0) &&
|
||||
IS_ALIGNED(src, 16) && (src_stride % 8 == 0) &&
|
||||
IS_ALIGNED(dst, 16) && (dst_stride % 8 == 0)) {
|
||||
IS_ALIGNED(src, 8) && (src_stride % 8 == 0) &&
|
||||
IS_ALIGNED(dst, 8) && (dst_stride % 8 == 0)) {
|
||||
TransposeWx8 = TransposeWx8_SSSE3;
|
||||
TransposeWxH = TransposeWxH_C;
|
||||
} else
|
||||
|
||||
102
source/row.h
102
source/row.h
@ -13,17 +13,91 @@
|
||||
|
||||
#include "libyuv/basic_types.h"
|
||||
|
||||
// The following are available on all x86 platforms
|
||||
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
|
||||
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
|
||||
#define HAS_ARGBTOYROW_SSSE3
|
||||
#define HAS_BG24TOARGBROW_SSSE3
|
||||
#define HAS_RAWTOARGBROW_SSSE3
|
||||
#define HAS_RGB24TOYROW_SSSE3
|
||||
#define HAS_RAWTOYROW_SSSE3
|
||||
#define HAS_RGB24TOUVROW_SSSE3
|
||||
#define HAS_RAWTOUVROW_SSSE3
|
||||
#endif
|
||||
|
||||
// The following are available only on Windows
|
||||
#if defined(WIN32) \
|
||||
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
|
||||
#define HAS_BGRATOYROW_SSSE3
|
||||
#define HAS_ABGRTOYROW_SSSE3
|
||||
#define HAS_ARGBTOUVROW_SSSE3
|
||||
#define HAS_BGRATOUVROW_SSSE3
|
||||
#define HAS_ABGRTOUVROW_SSSE3
|
||||
#endif
|
||||
|
||||
extern "C" {
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#endif
|
||||
#if defined(HAS_BG24TOARGBROW_SSSE3) && defined(HAS_ARGBTOYROW_SSSE3)
|
||||
#define HASRGB24TOYROW_SSSE3
|
||||
#endif
|
||||
#ifdef HASRGB24TOYROW_SSSE3
|
||||
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RGB24ToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void RAWToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#endif
|
||||
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void BGRAToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ABGRToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void BGRAToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void ABGRToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void RGB24ToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
#ifdef HAS_BG24TOARGBROW_SSSE3
|
||||
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
|
||||
#endif
|
||||
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
|
||||
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
|
||||
#else
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
#ifdef OSX
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
|
||||
#else
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
|
||||
#endif
|
||||
void FastConvertYUVToRGB32Row(const uint8* y_buf,
|
||||
const uint8* u_buf,
|
||||
const uint8* v_buf,
|
||||
@ -52,34 +126,6 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
|
||||
#ifdef HAS_ARGBTOYROW_SSSE3
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
#endif
|
||||
void ARGBToYRow_C(const uint8* src_argb, uint8* dst_y, int pix);
|
||||
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width);
|
||||
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
#define SIMD_ALIGNED(var) __declspec(align(16)) var
|
||||
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
|
||||
#else
|
||||
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
|
||||
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
|
||||
#endif
|
||||
|
||||
#ifdef OSX
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
|
||||
#else
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
|
||||
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
|
||||
#endif
|
||||
|
||||
// Method to force C version.
|
||||
//#define USE_MMX 0
|
||||
//#define USE_SSE2 0
|
||||
|
||||
@ -23,6 +23,16 @@ extern "C" TALIGN16(const uint8, kAdd16[16]) = {
|
||||
1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u, 1u
|
||||
};
|
||||
|
||||
// Shuffle table for converting BG24 to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
|
||||
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting RAW to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
|
||||
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
||||
};
|
||||
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
asm volatile(
|
||||
"movdqa (%3),%%xmm7\n"
|
||||
@ -55,47 +65,81 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
}
|
||||
#endif
|
||||
|
||||
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
|
||||
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
|
||||
#ifdef HAS_BG24TOARGBROW_SSSE3
|
||||
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm7\n"
|
||||
"movdqa (%3),%%xmm6\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm3\n"
|
||||
"lea 0x30(%0),%0\n"
|
||||
"movdqa %%xmm3,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm6,%%xmm2\n"
|
||||
"por %%xmm7,%%xmm2\n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm6,%%xmm0\n"
|
||||
"movdqa %%xmm2,0x20(%1)\n"
|
||||
"por %%xmm7,%%xmm0\n"
|
||||
"pshufb %%xmm6,%%xmm1\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"por %%xmm7,%%xmm1\n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm6,%%xmm3\n"
|
||||
"movdqa %%xmm1,0x10(%1)\n"
|
||||
"por %%xmm7,%%xmm3\n"
|
||||
"movdqa %%xmm3,0x30(%1)\n"
|
||||
"lea 0x40(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
: "+r"(src_bg24), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
: "r"(kShuffleMaskBG24ToARGB) // %3
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
|
||||
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
|
||||
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
|
||||
}
|
||||
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
|
||||
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
|
||||
}
|
||||
|
||||
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
|
||||
src_argb0 += 4;
|
||||
dst_y += 1;
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
const uint8* src_argb1 = src_argb0 + src_stride_argb;
|
||||
for (int x = 0; x < width - 1; x += 2) {
|
||||
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
|
||||
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
|
||||
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
|
||||
dst_u[0] = RGBToU(ar, ag, ab);
|
||||
dst_v[0] = RGBToV(ar, ag, ab);
|
||||
src_argb0 += 8;
|
||||
src_argb1 += 8;
|
||||
dst_u += 1;
|
||||
dst_v += 1;
|
||||
}
|
||||
if (width & 1) {
|
||||
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
|
||||
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
|
||||
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
|
||||
dst_u[0] = RGBToU(ar, ag, ab);
|
||||
dst_v[0] = RGBToV(ar, ag, ab);
|
||||
}
|
||||
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
asm volatile(
|
||||
"pcmpeqb %%xmm7,%%xmm7\n" // generate mask 0xff000000
|
||||
"pslld $0x18,%%xmm7\n"
|
||||
"movdqa (%3),%%xmm6\n"
|
||||
"1:"
|
||||
"movdqa (%0),%%xmm0\n"
|
||||
"movdqa 0x10(%0),%%xmm1\n"
|
||||
"movdqa 0x20(%0),%%xmm3\n"
|
||||
"lea 0x30(%0),%0\n"
|
||||
"movdqa %%xmm3,%%xmm2\n"
|
||||
"palignr $0x8,%%xmm1,%%xmm2\n" // xmm2 = { xmm3[0:3] xmm1[8:15] }
|
||||
"pshufb %%xmm6,%%xmm2\n"
|
||||
"por %%xmm7,%%xmm2\n"
|
||||
"palignr $0xc,%%xmm0,%%xmm1\n" // xmm1 = { xmm3[0:7] xmm0[12:15] }
|
||||
"pshufb %%xmm6,%%xmm0\n"
|
||||
"movdqa %%xmm2,0x20(%1)\n"
|
||||
"por %%xmm7,%%xmm0\n"
|
||||
"pshufb %%xmm6,%%xmm1\n"
|
||||
"movdqa %%xmm0,(%1)\n"
|
||||
"por %%xmm7,%%xmm1\n"
|
||||
"palignr $0x4,%%xmm3,%%xmm3\n" // xmm3 = { xmm3[4:15] }
|
||||
"pshufb %%xmm6,%%xmm3\n"
|
||||
"movdqa %%xmm1,0x10(%1)\n"
|
||||
"por %%xmm7,%%xmm3\n"
|
||||
"movdqa %%xmm3,0x30(%1)\n"
|
||||
"lea 0x40(%1),%1\n"
|
||||
"sub $0x10,%2\n"
|
||||
"ja 1b\n"
|
||||
: "+r"(src_raw), // %0
|
||||
"+r"(dst_argb), // %1
|
||||
"+r"(pix) // %2
|
||||
: "r"(kShuffleMaskRAWToARGB) // %3
|
||||
: "memory"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__x86_64__)
|
||||
|
||||
@ -611,4 +655,5 @@ void FastConvertYToRGB32Row(const uint8* y_buf,
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} // extern "C"
|
||||
|
||||
@ -10,6 +10,8 @@
|
||||
|
||||
#include "row.h"
|
||||
|
||||
#define kMaxStride (2048 * 4)
|
||||
|
||||
extern "C" {
|
||||
|
||||
#define MAKETABLE(NAME) \
|
||||
@ -301,4 +303,167 @@ MAKETABLE(kCoefficientsAbgrY)
|
||||
MAKETABLE(_kCoefficientsAbgrY)
|
||||
#endif
|
||||
|
||||
|
||||
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
|
||||
for (int x = 0; x < pix; ++x) {
|
||||
uint8 r = src_raw[0];
|
||||
uint8 g = src_raw[1];
|
||||
uint8 b = src_raw[2];
|
||||
dst_argb[0] = b;
|
||||
dst_argb[1] = g;
|
||||
dst_argb[2] = r;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb += 4;
|
||||
src_raw += 3;
|
||||
}
|
||||
}
|
||||
|
||||
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
for (int x = 0; x < pix; ++x) {
|
||||
uint8 b = src_bg24[0];
|
||||
uint8 g = src_bg24[1];
|
||||
uint8 r = src_bg24[2];
|
||||
dst_argb[0] = b;
|
||||
dst_argb[1] = g;
|
||||
dst_argb[2] = r;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb[3] = 255u;
|
||||
dst_argb += 4;
|
||||
src_bg24 += 3;
|
||||
}
|
||||
}
|
||||
|
||||
// C versions do the same
|
||||
void RGB24ToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
BG24ToARGBRow_C(src_argb, row, pix);
|
||||
ARGBToYRow_C(row, dst_y, pix);
|
||||
}
|
||||
|
||||
void RAWToYRow_C(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
RAWToARGBRow_C(src_argb, row, pix);
|
||||
ARGBToYRow_C(row, dst_y, pix);
|
||||
}
|
||||
|
||||
void RGB24ToUVRow_C(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
BG24ToARGBRow_C(src_argb, row, pix);
|
||||
BG24ToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
void RAWToUVRow_C(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
RAWToARGBRow_C(src_argb, row, pix);
|
||||
RAWToARGBRow_C(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
|
||||
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
|
||||
}
|
||||
|
||||
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
|
||||
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
|
||||
}
|
||||
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
|
||||
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
|
||||
}
|
||||
|
||||
#define MAKEROWY(NAME,R,G,B) \
|
||||
void NAME ## ToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) { \
|
||||
for (int x = 0; x < width; ++x) { \
|
||||
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
|
||||
src_argb0 += 4; \
|
||||
dst_y += 1; \
|
||||
} \
|
||||
} \
|
||||
void NAME ## ToUVRow_C(const uint8* src_rgb0, int src_stride_rgb, \
|
||||
uint8* dst_u, uint8* dst_v, int width) { \
|
||||
const uint8* src_rgb1 = src_rgb0 + src_stride_rgb; \
|
||||
for (int x = 0; x < width - 1; x += 2) { \
|
||||
uint8 ab = (src_rgb0[B] + src_rgb0[B + 4] + \
|
||||
src_rgb1[B] + src_rgb1[B + 4]) >> 2; \
|
||||
uint8 ag = (src_rgb0[G] + src_rgb0[G + 4] + \
|
||||
src_rgb1[G] + src_rgb1[G + 4]) >> 2; \
|
||||
uint8 ar = (src_rgb0[R] + src_rgb0[R + 4] + \
|
||||
src_rgb1[R] + src_rgb1[R + 4]) >> 2; \
|
||||
dst_u[0] = RGBToU(ar, ag, ab); \
|
||||
dst_v[0] = RGBToV(ar, ag, ab); \
|
||||
src_rgb0 += 8; \
|
||||
src_rgb1 += 8; \
|
||||
dst_u += 1; \
|
||||
dst_v += 1; \
|
||||
} \
|
||||
if (width & 1) { \
|
||||
uint8 ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
|
||||
uint8 ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
|
||||
uint8 ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
|
||||
dst_u[0] = RGBToU(ar, ag, ab); \
|
||||
dst_v[0] = RGBToV(ar, ag, ab); \
|
||||
} \
|
||||
}
|
||||
|
||||
MAKEROWY(ARGB,2,1,0)
|
||||
MAKEROWY(BGRA,1,2,3)
|
||||
MAKEROWY(ABGR,0,1,2)
|
||||
|
||||
#if defined(HAS_RAWTOYROW_SSSE3)
|
||||
|
||||
void RGB24ToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
BG24ToARGBRow_SSSE3(src_argb, row, pix);
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
|
||||
void RAWToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride]);
|
||||
RAWToARGBRow_SSSE3(src_argb, row, pix);
|
||||
ARGBToYRow_SSSE3(row, dst_y, pix);
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#if defined(HAS_RAWTOUVROW_SSSE3)
|
||||
#if defined(HAS_ARGBTOUVROW_SSSE3)
|
||||
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
BG24ToARGBRow_SSSE3(src_argb, row, pix);
|
||||
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
RAWToARGBRow_SSSE3(src_argb, row, pix);
|
||||
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_SSSE3(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
void RGB24ToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
BG24ToARGBRow_SSSE3(src_argb, row, pix);
|
||||
BG24ToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int pix) {
|
||||
SIMD_ALIGNED(uint8 row[kMaxStride * 2]);
|
||||
RAWToARGBRow_SSSE3(src_argb, row, pix);
|
||||
RAWToARGBRow_SSSE3(src_argb + src_stride_argb, row + kMaxStride, pix);
|
||||
ARGBToUVRow_C(row, kMaxStride, dst_u, dst_v, pix);
|
||||
}
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
} // extern "C"
|
||||
|
||||
@ -16,59 +16,160 @@ extern "C" {
|
||||
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
|
||||
|
||||
// Constant multiplication table for converting ARGB to I400.
|
||||
extern "C" TALIGN16(const int8, kRGBToY[16]) = {
|
||||
extern "C" TALIGN16(const int8, kARGBToY[16]) = {
|
||||
13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kRGBToU[16]) = {
|
||||
extern "C" TALIGN16(const int8, kARGBToU[16]) = {
|
||||
112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kRGBToV[16]) = {
|
||||
extern "C" TALIGN16(const int8, kARGBToV[16]) = {
|
||||
-18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
|
||||
};
|
||||
|
||||
// Constants for BGRA
|
||||
extern "C" TALIGN16(const int8, kBGRAToY[16]) = {
|
||||
0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kBGRAToU[16]) = {
|
||||
0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kBGRAToV[16]) = {
|
||||
0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
|
||||
};
|
||||
|
||||
// Constants for ABGR
|
||||
extern "C" TALIGN16(const int8, kABGRToY[16]) = {
|
||||
33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kABGRToU[16]) = {
|
||||
-38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const int8, kABGRToV[16]) = {
|
||||
112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const uint8, kAddY16[16]) = {
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
|
||||
16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
|
||||
};
|
||||
|
||||
extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
|
||||
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
|
||||
128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
|
||||
128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
|
||||
};
|
||||
|
||||
// Shuffle table for converting BG24 to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
|
||||
0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
|
||||
};
|
||||
|
||||
// Shuffle table for converting RAW to ARGB.
|
||||
extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
|
||||
2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
|
||||
};
|
||||
|
||||
// Convert 16 ARGB pixels (64 bytes) to 16 Y values
|
||||
__declspec(naked)
|
||||
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_argb
|
||||
mov edx, [esp + 8] // dst_y
|
||||
mov ecx, [esp + 12] // pix
|
||||
movdqa xmm7, _kRGBToY
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
movdqa xmm7, _kARGBToY
|
||||
movdqa xmm6, _kAddY16
|
||||
pcmpeqb xmm5, xmm5 // Generate mask 0x0000ffff
|
||||
psrld xmm5, 16
|
||||
|
||||
convertloop :
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
pmaddubsw xmm0, xmm7
|
||||
lea eax, [eax + 32]
|
||||
pmaddubsw xmm1, xmm7 // BG ra BG ra BG ra BG ra
|
||||
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
|
||||
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
|
||||
pand xmm2, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
|
||||
palignr xmm3, xmm1, 2
|
||||
paddw xmm3, xmm1
|
||||
pand xmm3, xmm5 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
|
||||
packssdw xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
|
||||
psrlw xmm2, 7 // 0B xx 0B xx 0B xx 0B xx
|
||||
packuswb xmm2, xmm2
|
||||
paddb xmm2, xmm6
|
||||
movq qword ptr [edx], xmm2
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 8
|
||||
ja convertloop
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pmaddubsw xmm0, xmm7
|
||||
pmaddubsw xmm1, xmm7
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm3, xmm7
|
||||
lea eax, [eax + 64]
|
||||
phaddw xmm0, xmm1
|
||||
phaddw xmm2, xmm3
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm6
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked)
|
||||
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
movdqa xmm7, _kBGRAToY
|
||||
movdqa xmm6, _kAddY16
|
||||
|
||||
convertloop :
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pmaddubsw xmm0, xmm7
|
||||
pmaddubsw xmm1, xmm7
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm3, xmm7
|
||||
lea eax, [eax + 64]
|
||||
phaddw xmm0, xmm1
|
||||
phaddw xmm2, xmm3
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm6
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked)
|
||||
void ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src_argb */
|
||||
mov edx, [esp + 8] /* dst_y */
|
||||
mov ecx, [esp + 12] /* pix */
|
||||
movdqa xmm7, _kABGRToY
|
||||
movdqa xmm6, _kAddY16
|
||||
|
||||
convertloop :
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pmaddubsw xmm0, xmm7
|
||||
pmaddubsw xmm1, xmm7
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm3, xmm7
|
||||
lea eax, [eax + 64]
|
||||
phaddw xmm0, xmm1
|
||||
phaddw xmm2, xmm3
|
||||
psrlw xmm0, 7
|
||||
psrlw xmm2, 7
|
||||
packuswb xmm0, xmm2
|
||||
paddb xmm0, xmm6
|
||||
movdqa [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
@ -84,55 +185,52 @@ __asm {
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
movdqa xmm7, _kRGBToU
|
||||
movdqa xmm6, _kRGBToV
|
||||
movdqa xmm7, _kARGBToU
|
||||
movdqa xmm6, _kARGBToV
|
||||
movdqa xmm5, _kAddUV128
|
||||
pcmpeqb xmm4, xmm4 // Generate mask 0x0000ffff
|
||||
psrld xmm4, 16
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
convertloop :
|
||||
// step 1 - subsample 8x2 argb pixels to 4x1
|
||||
movdqa xmm0, [eax] // 32x2 -> 32x1
|
||||
/* step 1 - subsample 16x2 argb pixels to 8x1 */
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + esi]
|
||||
movdqa xmm3, [eax + esi + 16]
|
||||
lea eax, [eax + 32]
|
||||
pavgb xmm0, xmm2
|
||||
pavgb xmm1, xmm3
|
||||
|
||||
movdqa xmm2, xmm0 // 32x1 -> 16x1
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pavgb xmm0, [eax + esi]
|
||||
pavgb xmm1, [eax + esi + 16]
|
||||
pavgb xmm2, [eax + esi + 32]
|
||||
pavgb xmm3, [eax + esi + 48]
|
||||
lea eax, [eax + 64]
|
||||
movdqa xmm4, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm2, xmm1, 0xdd
|
||||
pavgb xmm0, xmm2
|
||||
shufps xmm4, xmm1, 0xdd
|
||||
pavgb xmm0, xmm4
|
||||
movdqa xmm4, xmm2
|
||||
shufps xmm2, xmm3, 0x88
|
||||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 8 different pixels, its 4 pixels of U and 4 of V
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
pmaddubsw xmm3, xmm6
|
||||
phaddw xmm0, xmm2
|
||||
phaddw xmm1, xmm3
|
||||
psraw xmm0, 8
|
||||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
palignr xmm2, xmm0, 2 // AR xx AR xx AR xx AR xx
|
||||
paddw xmm2, xmm0 // BGRA xx BGRA xx BGRA xx BGRA xx
|
||||
pand xmm2, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
|
||||
|
||||
palignr xmm3, xmm1, 2
|
||||
paddw xmm3, xmm1
|
||||
pand xmm3, xmm4 // BGRA 00 BGRA 00 BGRA 00 BGRA 00
|
||||
|
||||
psraw xmm2, 8
|
||||
psraw xmm3, 8
|
||||
packsswb xmm2, xmm3 // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
|
||||
paddb xmm2, xmm5 // -> unsigned
|
||||
packuswb xmm2, xmm2 // 8 bytes. 4 U, 4 V
|
||||
|
||||
// step 3 - store 4 U and 4 V values
|
||||
movd dword ptr [edx], xmm2 // U
|
||||
lea edx, [edx + 4]
|
||||
pshufd xmm0, xmm2, 0x55 // V
|
||||
movd dword ptr [edi], xmm0
|
||||
lea edi, [edi + 4]
|
||||
sub ecx, 8
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
pop edi
|
||||
pop esi
|
||||
@ -140,45 +238,208 @@ __asm {
|
||||
}
|
||||
}
|
||||
|
||||
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
|
||||
return (( 66 * r + 129 * g + 25 * b + 128) >> 8) + 16;
|
||||
}
|
||||
__declspec(naked)
|
||||
void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // src_argb
|
||||
mov esi, [esp + 8 + 8] // src_stride_argb
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
movdqa xmm7, _kBGRAToU
|
||||
movdqa xmm6, _kBGRAToV
|
||||
movdqa xmm5, _kAddUV128
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
|
||||
return ((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128;
|
||||
}
|
||||
static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
|
||||
return ((112 * r - 94 * g - 18 * b + 128) >> 8) + 128;
|
||||
}
|
||||
convertloop :
|
||||
/* step 1 - subsample 16x2 argb pixels to 8x1 */
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pavgb xmm0, [eax + esi]
|
||||
pavgb xmm1, [eax + esi + 16]
|
||||
pavgb xmm2, [eax + esi + 32]
|
||||
pavgb xmm3, [eax + esi + 48]
|
||||
lea eax, [eax + 64]
|
||||
movdqa xmm4, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm4, xmm1, 0xdd
|
||||
pavgb xmm0, xmm4
|
||||
movdqa xmm4, xmm2
|
||||
shufps xmm2, xmm3, 0x88
|
||||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
|
||||
for (int x = 0; x < width; ++x) {
|
||||
dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
|
||||
src_argb0 += 4;
|
||||
dst_y += 1;
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
pmaddubsw xmm3, xmm6
|
||||
phaddw xmm0, xmm2
|
||||
phaddw xmm1, xmm3
|
||||
psraw xmm0, 8
|
||||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
const uint8* src_argb1 = src_argb0 + src_stride_argb;
|
||||
for (int x = 0; x < width - 1; x += 2) {
|
||||
uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
|
||||
uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
|
||||
uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
|
||||
dst_u[0] = RGBToU(ar, ag, ab);
|
||||
dst_v[0] = RGBToV(ar, ag, ab);
|
||||
src_argb0 += 8;
|
||||
src_argb1 += 8;
|
||||
dst_u += 1;
|
||||
dst_v += 1;
|
||||
__declspec(naked)
|
||||
void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
|
||||
uint8* dst_u, uint8* dst_v, int width) {
|
||||
__asm {
|
||||
push esi
|
||||
push edi
|
||||
mov eax, [esp + 8 + 4] // src_argb
|
||||
mov esi, [esp + 8 + 8] // src_stride_argb
|
||||
mov edx, [esp + 8 + 12] // dst_u
|
||||
mov edi, [esp + 8 + 16] // dst_v
|
||||
mov ecx, [esp + 8 + 20] // pix
|
||||
movdqa xmm7, _kABGRToU
|
||||
movdqa xmm6, _kABGRToV
|
||||
movdqa xmm5, _kAddUV128
|
||||
sub edi, edx // stride from u to v
|
||||
|
||||
convertloop :
|
||||
/* step 1 - subsample 16x2 argb pixels to 8x1 */
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm2, [eax + 32]
|
||||
movdqa xmm3, [eax + 48]
|
||||
pavgb xmm0, [eax + esi]
|
||||
pavgb xmm1, [eax + esi + 16]
|
||||
pavgb xmm2, [eax + esi + 32]
|
||||
pavgb xmm3, [eax + esi + 48]
|
||||
lea eax, [eax + 64]
|
||||
movdqa xmm4, xmm0
|
||||
shufps xmm0, xmm1, 0x88
|
||||
shufps xmm4, xmm1, 0xdd
|
||||
pavgb xmm0, xmm4
|
||||
movdqa xmm4, xmm2
|
||||
shufps xmm2, xmm3, 0x88
|
||||
shufps xmm4, xmm3, 0xdd
|
||||
pavgb xmm2, xmm4
|
||||
|
||||
// step 2 - convert to U and V
|
||||
// from here down is very similar to Y code except
|
||||
// instead of 16 different pixels, its 8 pixels of U and 8 of V
|
||||
movdqa xmm1, xmm0
|
||||
movdqa xmm3, xmm2
|
||||
pmaddubsw xmm0, xmm7 // U
|
||||
pmaddubsw xmm2, xmm7
|
||||
pmaddubsw xmm1, xmm6 // V
|
||||
pmaddubsw xmm3, xmm6
|
||||
phaddw xmm0, xmm2
|
||||
phaddw xmm1, xmm3
|
||||
psraw xmm0, 8
|
||||
psraw xmm1, 8
|
||||
packsswb xmm0, xmm1
|
||||
paddb xmm0, xmm5 // -> unsigned
|
||||
|
||||
// step 3 - store 8 U and 8 V values
|
||||
movlps qword ptr [edx], xmm0 // U
|
||||
movhps qword ptr [edx + edi], xmm0 // V
|
||||
lea edx, [edx + 8]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
pop edi
|
||||
pop esi
|
||||
ret
|
||||
}
|
||||
if (width & 1) {
|
||||
uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
|
||||
uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
|
||||
uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
|
||||
dst_u[0] = RGBToU(ar, ag, ab);
|
||||
dst_v[0] = RGBToV(ar, ag, ab);
|
||||
}
|
||||
|
||||
__declspec(naked)
|
||||
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_bg24
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
|
||||
pslld xmm7, 24
|
||||
movdqa xmm6, _kShuffleMaskBG24ToARGB
|
||||
|
||||
convertloop :
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm3, [eax + 32]
|
||||
lea eax, [eax + 48]
|
||||
movdqa xmm2, xmm3
|
||||
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
|
||||
pshufb xmm2, xmm6
|
||||
por xmm2, xmm7
|
||||
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
|
||||
pshufb xmm0, xmm6
|
||||
movdqa [edx + 32], xmm2
|
||||
por xmm0, xmm7
|
||||
pshufb xmm1, xmm6
|
||||
movdqa [edx], xmm0
|
||||
por xmm1, xmm7
|
||||
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
|
||||
pshufb xmm3, xmm6
|
||||
movdqa [edx + 16], xmm1
|
||||
por xmm3, xmm7
|
||||
movdqa [edx + 48], xmm3
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
__declspec(naked)
|
||||
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
|
||||
int pix) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] // src_raw
|
||||
mov edx, [esp + 8] // dst_argb
|
||||
mov ecx, [esp + 12] // pix
|
||||
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
|
||||
pslld xmm7, 24
|
||||
movdqa xmm6, _kShuffleMaskRAWToARGB
|
||||
|
||||
convertloop :
|
||||
movdqa xmm0, [eax]
|
||||
movdqa xmm1, [eax + 16]
|
||||
movdqa xmm3, [eax + 32]
|
||||
lea eax, [eax + 48]
|
||||
movdqa xmm2, xmm3
|
||||
palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]}
|
||||
pshufb xmm2, xmm6
|
||||
por xmm2, xmm7
|
||||
palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]}
|
||||
pshufb xmm0, xmm6
|
||||
movdqa [edx + 32], xmm2
|
||||
por xmm0, xmm7
|
||||
pshufb xmm1, xmm6
|
||||
movdqa [edx], xmm0
|
||||
por xmm1, xmm7
|
||||
palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]}
|
||||
pshufb xmm3, xmm6
|
||||
movdqa [edx + 16], xmm1
|
||||
por xmm3, xmm7
|
||||
movdqa [edx + 48], xmm3
|
||||
lea edx, [edx + 64]
|
||||
sub ecx, 16
|
||||
ja convertloop
|
||||
ret
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user