mirror of https://chromium.googlesource.com/libyuv/libyuv (synced 2025-12-07 17:26:49 +08:00)
ARGB to and from I420 ported to x64
BUG=none
TEST=media_unittests
Review URL: http://webrtc-codereview.appspot.com/266003
git-svn-id: http://libyuv.googlecode.com/svn/trunk@61 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent
755de365c4
commit
b61497636a
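
Reviewer note, not part of the commit: the change below renames the constant-mask register in the SSE2/SSSE3 row functions from xmm7 to xmm5 and cleans up the cpu dispatch so the same conversion code can build for 64-bit targets. A minimal usage sketch of the converter this commit targets, using the argument order visible in the ConvertToI420 switch further down (the 64x48 frame size and buffer names are made-up values for illustration; at this revision ARGBToI420 is likely declared via libyuv/planar_functions.h):

// Sketch: convert a packed 64x48 ARGB frame to planar I420.
void ExampleArgbToI420(const uint8* argb, uint8* y, uint8* u, uint8* v) {
  const int w = 64, h = 48;
  libyuv::ARGBToI420(argb, w * 4,   // 4 bytes per ARGB pixel
                     y, w,          // Y plane, full resolution
                     u, w / 2,      // U plane, 2x2 subsampled
                     v, w / 2,      // V plane, 2x2 subsampled
                     w, h);
}
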
@@ -19,15 +19,6 @@

#ifndef INT_TYPES_DEFINED
#define INT_TYPES_DEFINED
#ifdef COMPILER_MSVC
typedef __int64 int64;
#else
typedef long long int64;
#endif /* COMPILER_MSVC */
typedef int int32;
typedef short int16;
typedef char int8;

#ifdef COMPILER_MSVC
typedef unsigned __int64 uint64;
typedef __int64 int64;
@@ -38,7 +29,18 @@ typedef __int64 int64;
#define UINT64_C(x) x ## UI64
#endif
#define INT64_F "I64"
#else
#else // COMPILER_MSVC
#ifdef __LP64__
typedef unsigned long uint64;
typedef long int64;
#ifndef INT64_C
#define INT64_C(x) x ## L
#endif
#ifndef UINT64_C
#define UINT64_C(x) x ## UL
#endif
#define INT64_F "l"
#else // __LP64__
typedef unsigned long long uint64;
typedef long long int64;
#ifndef INT64_C
@@ -48,10 +50,14 @@ typedef long long int64;
#define UINT64_C(x) x ## ULL
#endif
#define INT64_F "ll"
#endif /* COMPILER_MSVC */
#endif // __LP64__
#endif // COMPILER_MSVC
typedef unsigned int uint32;
typedef int int32;
typedef unsigned short uint16;
typedef short int16;
typedef unsigned char uint8;
typedef char int8;
#endif // INT_TYPES_DEFINED

// Detect compiler is for x86 or x64.
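
A sketch of what the INT64_F macro above is for: it supplies the printf length prefix matching whichever int64 typedef was selected ("I64" on MSVC, "l" on LP64 systems, "ll" elsewhere); PrintTicks is a made-up name:

#include <stdio.h>

void PrintTicks(int64 ticks) {
  printf("ticks=%" INT64_F "d\n", ticks);  // expands to %I64d, %ld or %lld
}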

@@ -13,6 +13,7 @@
#define INCLUDE_LIBYUV_CONVERT_H_

#include "libyuv/basic_types.h"
#include "libyuv/rotate.h"

namespace libyuv {

@@ -92,6 +93,17 @@ int NV12ToRGB565(const uint8* src_y, int src_stride_y,
uint8* dst_frame, int dst_stride_frame,
int width, int height);

} // namespace libyuv
// Convert camera sample to I420 with cropping, rotation and vertical flip.
int ConvertToI420(const uint8* src_frame, size_t src_size,
uint8* dst_y, int dst_stride_y,
uint8* dst_u, int dst_stride_u,
uint8* dst_v, int dst_stride_v,
int horiz_crop, int vert_crop,
int w, int h,
int dw, int idh,
RotationMode rotation,
uint32 format);

} // namespace libyuv

#endif // INCLUDE_LIBYUV_CONVERT_H_
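
A hedged usage sketch for the ConvertToI420 entry point declared above. kRotate0 is assumed from libyuv's rotate.h and FOURCC_YUY2 from video_common.h; per the implementation later in this commit, zero means success:

// Sketch: no cropping, no rotation, destination same size as source.
int CaptureFrameToI420(const uint8* yuy2, size_t yuy2_size,
                       uint8* y, uint8* u, uint8* v,
                       int width, int height) {
  int uv_stride = (width + 1) / 2;
  return libyuv::ConvertToI420(yuy2, yuy2_size,
                               y, width,
                               u, uv_stride,
                               v, uv_stride,
                               0, 0,           // horiz_crop, vert_crop
                               width, height,  // w, h
                               width, height,  // dw, idh
                               libyuv::kRotate0,
                               FOURCC_YUY2);
}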

@@ -27,7 +27,9 @@ static const int kCpuInitialized = 8;
bool TestCpuFlag(int flag);

// For testing, allow CPU flags to be disabled.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3. -1 to enable all.
// ie MaskCpuFlags(~kCpuHasSSSE3) to disable SSSE3.
// -1 to enable all cpu specific optimizations.
// 0 to disable all cpu specific optimizations.
void MaskCpuFlags(int enable_flags);

} // namespace libyuv
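
The testing pattern the MaskCpuFlags comment describes, as a short sketch:

libyuv::MaskCpuFlags(0);    // disable all cpu specific optimizations (C only)
// ... run a conversion and keep the output as the reference ...
libyuv::MaskCpuFlags(-1);   // re-enable everything the CPU actually supports
// ... run the same conversion again and compare against the reference ...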

@@ -13,7 +13,11 @@
#include "conversion_tables.h"
#include "libyuv/basic_types.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "row.h"
#include "video_common.h"

//#define SCALEOPT //Currently for windows only. June 2010

@@ -650,7 +654,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -661,7 +665,7 @@ int ARGBToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = ARGBToYRow_C;
}
#if defined(HAS_ARGBTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
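
All of the *ToI420 functions in this file select their row helpers the same way; the sketch below restates the pattern: take the SSSE3 kernel only when the CPU flag is set and every pointer and stride satisfies the 16-byte alignment that the movdqa-based code assumes, otherwise fall back to the portable C row function.

void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
#if defined(HAS_ARGBTOYROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3) &&
      (width % 16 == 0) &&
      IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
      IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
    ARGBToYRow = ARGBToYRow_SSSE3;
  } else
#endif
  {
    ARGBToYRow = ARGBToYRow_C;
  }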

@@ -703,7 +707,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_BGRATOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -714,7 +718,7 @@ int BGRAToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = BGRAToYRow_C;
}
#if defined(HAS_BGRATOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -756,7 +760,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_ABGRTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -767,7 +771,7 @@ int ABGRToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = ABGRToYRow_C;
}
#if defined(HAS_ABGRTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -809,7 +813,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RGB24TOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -820,7 +824,7 @@ int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = RGB24ToYRow_C;
}
#if defined(HAS_RGB24TOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -862,7 +866,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);
#if defined(HAS_RAWTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -873,7 +877,7 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
ARGBToYRow = RAWToYRow_C;
}
#if defined(HAS_RAWTOUVROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_frame, 16) && (src_stride_frame % 16 == 0) &&
IS_ALIGNED(dst_u, 8) && (dst_stride_u % 8 == 0) &&
@@ -901,4 +905,163 @@ int RAWToI420(const uint8* src_frame, int src_stride_frame,
return 0;
}

// Convert camera sample to I420 with cropping, rotation and vertical flip.
int ConvertToI420(const uint8* sample, size_t sample_size,
uint8* y, int y_stride,
uint8* u, int u_stride,
uint8* v, int v_stride,
int horiz_crop, int vert_crop,
int w, int h,
int dw, int idh,
RotationMode rotation,
uint32 format) {
int aw = (w + 1) & ~1;
const uint8* src;
const uint8* src_uv;
int abs_h = (h < 0) ? -h : h;
switch (format) {
// Single plane formats
case FOURCC_YUY2:
src = sample + (aw * vert_crop + horiz_crop) * 2;
YUY2ToI420(src, aw * 2,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_UYVY:
src = sample + (aw * vert_crop + horiz_crop) * 2;
UYVYToI420(src, aw * 2,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_24BG:
src = sample + (w * vert_crop + horiz_crop) * 3;
RGB24ToI420(src, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_RAW:
src = sample + (w * vert_crop + horiz_crop) * 3;
RAWToI420(src, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_ARGB:
src = sample + (w * vert_crop + horiz_crop) * 4;
ARGBToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_BGRA:
src = sample + (w * vert_crop + horiz_crop) * 4;
BGRAToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_ABGR:
src = sample + (w * vert_crop + horiz_crop) * 4;
ABGRToI420(src, w * 4,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_BGGR:
case FOURCC_RGGB:
case FOURCC_GRBG:
case FOURCC_GBRG:
// TODO(fbarchard): We could support cropping by odd numbers by
// adjusting fourcc.
src = sample + (w * vert_crop + horiz_crop);
BayerRGBToI420(src, w, format,
y, y_stride, u, u_stride, v, v_stride,
dw, idh);
break;
// Biplanar formats
case FOURCC_M420:
src = sample + (w * vert_crop) * 12 / 8 + horiz_crop;
M420ToI420(src, w,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
case FOURCC_NV12:
src = sample + (w * vert_crop + horiz_crop);
src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
NV12ToI420Rotate(src, w,
src_uv, aw,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh, rotation);
break;
case FOURCC_NV21:
src = sample + (w * vert_crop + horiz_crop);
src_uv = sample + aw * (h + vert_crop / 2) + horiz_crop;
// Call NV12 but with u and v parameters swapped.
NV12ToI420Rotate(src, w,
src_uv, aw,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh, rotation);
break;
case FOURCC_Q420:
src = sample + (w + aw * 2) * vert_crop + horiz_crop;
src_uv = sample + (w + aw * 2) * vert_crop + w + horiz_crop * 2;
Q420ToI420(src, w * 3,
src_uv, w * 3,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh);
break;
// Triplanar formats
case FOURCC_I420:
case FOURCC_YV12: {
const uint8* src_y = sample + (w * vert_crop + horiz_crop);
const uint8* src_u;
const uint8* src_v;
int halfwidth = (w + 1) / 2;
int halfheight = (abs_h + 1) / 2;
if (format == FOURCC_I420) {
src_u = sample + w * abs_h +
(halfwidth * vert_crop + horiz_crop) / 2;
src_v = sample + w * abs_h +
halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
} else {
src_v = sample + w * abs_h +
(halfwidth * vert_crop + horiz_crop) / 2;
src_u = sample + w * abs_h +
halfwidth * (halfheight + vert_crop / 2) + horiz_crop / 2;
}
I420Rotate(src_y, w,
src_u, halfwidth,
src_v, halfwidth,
y, y_stride,
u, u_stride,
v, v_stride,
dw, idh, rotation);
break;
}
// Formats not supported
case FOURCC_MJPG:
default:
return -1;  // unknown fourcc - return failure code.
}
return 0;
}

} // namespace libyuv
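
Worked example of the crop offset arithmetic in ConvertToI420 above, for the FOURCC_YUY2 case: a 640x480 YUY2 frame has an even-aligned width aw = 640 and a row stride of aw * 2 = 1280 bytes (two bytes per pixel). Cropping 4 rows and 8 pixels gives

  src = sample + (aw * vert_crop + horiz_crop) * 2
      = sample + (640 * 4 + 8) * 2
      = sample + 5136    // 4 full rows (5120 bytes) plus 8 pixels (16 bytes)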

@@ -14,11 +14,14 @@
#ifdef _MSC_VER
#include <intrin.h>
#endif
#ifdef __ANDROID__
#include <cpu-features.h>
#endif

// TODO(fbarchard): Use cpuid.h when gcc 4.4 is used on OSX and Linux.
#if (defined(__pic__) || defined(__APPLE__)) && defined(__i386__)
static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile (
asm volatile (
"mov %%ebx, %%edi\n"
"cpuid\n"
"xchg %%edi, %%ebx\n"
@@ -28,7 +31,7 @@ static inline void __cpuid(int cpu_info[4], int info_type) {
}
#elif defined(__i386__) || defined(__x86_64__)
static inline void __cpuid(int cpu_info[4], int info_type) {
__asm__ volatile (
asm volatile (
"cpuid\n"
: "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type)
@@ -49,6 +52,10 @@ static void InitCpuFlags() {
cpu_info_ = (cpu_info[3] & 0x04000000 ? kCpuHasSSE2 : 0) |
(cpu_info[2] & 0x00000200 ? kCpuHasSSSE3 : 0) |
kCpuInitialized;
#elif defined(__ANDROID__) && defined(__ARM_NEON__)
features = android_getCpuFeatures();
cpu_info_ = ((features & ANDROID_CPU_ARM_FEATURE_NEON) ? kCpuHasNEON : 0) |
kCpuInitialized;
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__
// Enable Neon if you want support for Neon and Arm, and use MaskCpuFlags
@@ -61,14 +68,14 @@ static void InitCpuFlags() {

void MaskCpuFlags(int enable_flags) {
InitCpuFlags();
cpu_info_ &= enable_flags;
cpu_info_ = (cpu_info_ & enable_flags) | kCpuInitialized;
}

bool TestCpuFlag(int flag) {
if (0 == cpu_info_) {
InitCpuFlags();
}
return cpu_info_ & flag ? true : false;
return (cpu_info_ & flag) ? true : false;
}

} // namespace libyuv
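
The hex masks tested in InitCpuFlags above map to CPUID leaf 1 feature bits; a standalone sketch of the same check, using the __cpuid shim defined in this file:

int cpu_info[4];
__cpuid(cpu_info, 1);                              // leaf 1: feature bits
bool has_sse2  = (cpu_info[3] & 0x04000000) != 0;  // EDX bit 26
bool has_ssse3 = (cpu_info[2] & 0x00000200) != 0;  // ECX bit 9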

@@ -14,8 +14,6 @@
#include "video_common.h"
#include "row.h"

#define kMaxStride (2048 * 4)

namespace libyuv {

// Note: to do this with Neon vld4.8 would load ARGB values into 4 registers
@@ -168,7 +166,7 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 r = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) {
for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = src_bayer0[0];
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]);
@@ -187,10 +185,12 @@ static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = AVG(r, src_bayer1[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[0];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer1[1];
dst_rgb[7] = 255U;
if (pix & 1) {
dst_rgb[4] = src_bayer0[0];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer1[1];
dst_rgb[7] = 255U;
}
}
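
For reference, a sketch of the 2x2 tile BayerRowBG reads (the BG in the name is the tile's top row):

// row0: B g B g ...   src_bayer0[0] = blue,  src_bayer0[1] = green
// row1: g R g R ...   src_bayer1[0] = green, src_bayer1[1] = red
// Blue is copied straight from the blue site; green and red are averaged
// with the previous column's samples carried in the g and r locals, a
// simple bilinear-style demosaic. Output byte order is B, G, R, A.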

static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
@@ -198,7 +198,7 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 g = src_bayer0[1];
uint8 b = src_bayer1[1];
for (int x = 0; x < (pix - 2); x += 2) {
for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = AVG(b, src_bayer1[1]);
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0];
@@ -217,17 +217,19 @@ static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = AVG(g, src_bayer0[1]);
dst_rgb[2] = src_bayer0[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[1];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer0[0];
dst_rgb[7] = 255U;
if (pix & 1) {
dst_rgb[4] = src_bayer1[1];
dst_rgb[5] = src_bayer0[1];
dst_rgb[6] = src_bayer0[0];
dst_rgb[7] = 255U;
}
}

static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 b = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) {
for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = AVG(b, src_bayer0[1]);
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0];
@@ -245,17 +247,19 @@ static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = src_bayer1[0];
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer0[1];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer1[0];
dst_rgb[7] = 255U;
if (pix & 1) {
dst_rgb[4] = src_bayer0[1];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer1[0];
dst_rgb[7] = 255U;
}
}

static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
uint8* dst_rgb, int pix) {
const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
uint8 r = src_bayer0[1];
for (int x = 0; x < (pix - 2); x += 2) {
for (int x = 0; x < pix - 3; x += 2) {
dst_rgb[0] = src_bayer1[0];
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]);
@@ -273,10 +277,12 @@ static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
dst_rgb[1] = src_bayer0[0];
dst_rgb[2] = AVG(r, src_bayer0[1]);
dst_rgb[3] = 255U;
dst_rgb[4] = src_bayer1[0];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer0[1];
dst_rgb[7] = 255U;
if (pix & 1) {
dst_rgb[4] = src_bayer1[0];
dst_rgb[5] = src_bayer0[0];
dst_rgb[6] = src_bayer0[1];
dst_rgb[7] = 255U;
}
}

// Converts any Bayer RGB format to ARGB.
@@ -315,7 +321,7 @@ int BayerRGBToARGB(const uint8* src_bayer, int src_stride_bayer,
break;
}

for (int y = 0; y < (height - 1); y += 2) {
for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, dst_rgb, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
dst_rgb + dst_stride_rgb, width);
@@ -403,7 +409,7 @@ int BayerRGBToI420(const uint8* src_bayer, int src_stride_bayer,
break;
}

for (int y = 0; y < (height - 1); y += 2) {
for (int y = 0; y < height - 1; y += 2) {
BayerRow0(src_bayer, src_stride_bayer, row, width);
BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
row + kMaxStride, width);

@@ -26,11 +26,11 @@ static void SplitUV_NEON(const uint8* src_uv,
__asm__ volatile
(
"1:\n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
"vst1.u8 {q0}, [%1]! \n" // store U
"vst1.u8 {q1}, [%2]! \n" // Store V
"subs %3, %3, #16 \n" // 16 processed per loop
"bhi 1b \n"
"vld2.u8 {q0,q1}, [%0]!\n" // load 16 pairs of UV
"vst1.u8 {q0}, [%1]!\n" // store U
"vst1.u8 {q1}, [%2]!\n" // Store V
"subs %3, %3, #16\n" // 16 processed per loop
"bhi 1b\n"
: "+r"(src_uv),
"+r"(dst_u),
"+r"(dst_v),
@@ -48,16 +48,6 @@ static void SplitUV_NEON(const uint8* src_uv,
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
#endif

// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};

#if defined(WIN32) && !defined(COVERAGE_ENABLED)
#define HAS_SPLITUV_SSE2
__declspec(naked)
@@ -69,8 +59,8 @@ static void SplitUV_SSE2(const uint8* src_uv,
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

wloop:
movdqa xmm0, [eax]
@@ -78,8 +68,8 @@ static void SplitUV_SSE2(const uint8* src_uv,
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm0, xmm7 // even bytes
pand xmm1, xmm7
pand xmm0, xmm5 // even bytes
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
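
A likely motivation for the xmm7 to xmm5 renames throughout this commit ("ported to x64"): in the Windows x64 calling convention xmm0..xmm5 are volatile while xmm6..xmm15 must be preserved by the callee, so a naked or inline-asm routine that scratches xmm7 is only safe in 32-bit builds.

; volatile (caller-saved):    xmm0..xmm5  - free to clobber
; nonvolatile (callee-saved): xmm6..xmm15 - would need save/restore code

Keeping the constant mask in xmm5 lets the same register choice work for both 32-bit and 64-bit targets without prologue/epilogue spills.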

@@ -101,16 +91,16 @@ static void SplitUV_SSE2(const uint8* src_uv,
static void SplitUV_SSE2(const uint8* src_uv,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"psrlw $0x8,%%xmm7\n"
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm7,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
@@ -126,7 +116,10 @@ static void SplitUV_SSE2(const uint8* src_uv,
"+r"(dst_v), // %2
"+r"(pix) // %3
:
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif
@@ -196,15 +189,15 @@ int I420Copy(const uint8* src_y, int src_stride_y,
static void SetRow32_NEON(uint8* dst, uint32 v32, int count) {
__asm__ volatile
(
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"vdup.u32 q0, %2\n" // duplicate 4 ints
"1:\n"
"vst1.u32 {q0}, [%0]! \n" // store
"subs %1, %1, #16 \n" // 16 processed per loop
"bhi 1b \n"
"vst1.u32 {q0}, [%0]!\n" // store
"subs %1, %1, #16\n" // 16 processed per loop
"bhi 1b\n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "q0", "memory"
: "q0", "memory", "cc"
);
}
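
A portable C restatement of what SetRow32 stores, as a sketch for clarity; like the asm's 16-bytes-per-iteration loop, count is in bytes, and dst is assumed at least 4-byte aligned (the callers require 16):

static void SetRow32_C(uint8* dst, uint32 v32, int count) {
  uint32* d = reinterpret_cast<uint32*>(dst);
  for (int i = 0; i < count / 4; ++i) {
    d[i] = v32;  // replicate the 32 bit value across the row
  }
}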

@@ -214,12 +207,12 @@ __declspec(naked)
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
__asm {
mov eax, [esp + 4] // dst
movd xmm7, [esp + 8] // v32
movd xmm5, [esp + 8] // v32
mov ecx, [esp + 12] // count
pshufd xmm7, xmm7, 0
pshufd xmm5, xmm5, 0

wloop:
movdqa [eax], xmm7
movdqa [eax], xmm5
lea eax, [eax + 16]
sub ecx, 16
ja wloop
@@ -233,17 +226,20 @@ static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
#define HAS_SETROW_SSE2
static void SetRow32_SSE2(uint8* dst, uint32 v32, int count) {
asm volatile(
"movd %2, %%xmm7\n"
"pshufd $0x0,%%xmm7,%%xmm7\n"
"movd %2, %%xmm5\n"
"pshufd $0x0,%%xmm5,%%xmm5\n"
"1:"
"movdqa %%xmm7,(%0)\n"
"movdqa %%xmm5,(%0)\n"
"lea 0x10(%0),%0\n"
"sub $0x10,%1\n"
"ja 1b\n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm5"
#endif
);
}
#endif
@@ -257,13 +253,13 @@ static void I420SetPlane(uint8* dst_y, int dst_stride_y,
int value) {
void (*SetRow)(uint8* dst, uint32 value, int pix);
#if defined(HAS_SETROW_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
if (TestCpuFlag(kCpuHasNEON) &&
(width % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
SetRow = SetRow32_NEON;
} else
#elif defined(HAS_SETROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
SetRow = SetRow32_SSE2;
@@ -418,7 +414,7 @@ static int X420ToI420(const uint8* src_y,
int halfwidth = (width + 1) >> 1;
void (*SplitUV)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITUV_NEON)
if (libyuv::TestCpuFlag(libyuv::kCpuHasNEON) &&
if (TestCpuFlag(kCpuHasNEON) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
@@ -426,7 +422,7 @@ static int X420ToI420(const uint8* src_y,
SplitUV = SplitUV_NEON;
} else
#elif defined(HAS_SPLITUV_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(halfwidth % 16 == 0) &&
IS_ALIGNED(src_uv, 16) && (src_stride_uv % 16 == 0) &&
IS_ALIGNED(dst_u, 16) && (dst_stride_u % 16 == 0) &&
@@ -510,8 +506,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
mov esi, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

wloop:
movdqa xmm0, [eax]
@@ -519,8 +515,8 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
lea eax, [eax + 32]
movdqa xmm2, xmm0
movdqa xmm3, xmm1
pand xmm2, xmm7 // even bytes are Y
pand xmm3, xmm7
pand xmm2, xmm5 // even bytes are Y
pand xmm3, xmm5
packuswb xmm2, xmm3
movdqa [edx], xmm2
lea edx, [edx + 16]
@@ -528,7 +524,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
movq qword ptr [esi], xmm0
lea esi, [esi + 8]
@@ -551,16 +547,16 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2,
static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
uint8* dst_u, uint8* dst_v, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"psrlw $0x8,%%xmm7\n"
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"movdqa %%xmm0,%%xmm2\n"
"movdqa %%xmm1,%%xmm3\n"
"pand %%xmm7,%%xmm2\n"
"pand %%xmm7,%%xmm3\n"
"pand %%xmm5,%%xmm2\n"
"pand %%xmm5,%%xmm3\n"
"packuswb %%xmm3,%%xmm2\n"
"movdqa %%xmm2,(%1)\n"
"lea 0x10(%1),%1\n"
@@ -568,7 +564,7 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%2)\n"
"lea 0x8(%2),%2\n"
@@ -584,7 +580,10 @@ static void SplitYUY2_SSE2(const uint8* src_yuy2, uint8* dst_y,
"+r"(dst_v), // %3
"+r"(pix) // %4
:
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif
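
What the pand/psrlw/packuswb sequences in SplitYUY2 compute, as a scalar sketch (assuming an even pix): YUY2 memory is Y0 U0 Y1 V0 Y2 U1 Y3 V1 ..., so masking with 0x00ff00ff keeps the even bytes (Y) and shifting each 16-bit lane right by 8 keeps the odd bytes (U and V):

for (int i = 0; i < pix; i += 2) {
  dst_y[i]     = src_yuy2[i * 2];      // even bytes are Y
  dst_y[i + 1] = src_yuy2[i * 2 + 2];
  dst_u[i / 2] = src_yuy2[i * 2 + 1];  // odd bytes alternate U, V
  dst_v[i / 2] = src_yuy2[i * 2 + 3];
}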

@@ -626,7 +625,7 @@ int Q420ToI420(const uint8* src_y, int src_stride_y,
void (*SplitYUY2)(const uint8* src_yuy2,
uint8* dst_y, uint8* dst_u, uint8* dst_v, int pix);
#if defined(HAS_SPLITYUY2_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
@@ -662,15 +661,15 @@ void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
mov eax, [esp + 4] // src_yuy2
mov edx, [esp + 8] // dst_y
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

wloop:
movdqa xmm0, [eax]
movdqa xmm1, [eax + 16]
lea eax, [eax + 32]
pand xmm0, xmm7 // even bytes are Y
pand xmm1, xmm7
pand xmm0, xmm5 // even bytes are Y
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa [edx], xmm0
lea edx, [edx + 16]
@@ -691,8 +690,8 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

wloop:
movdqa xmm0, [eax]
@@ -706,7 +705,7 @@ void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
psrlw xmm1, 8
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
@@ -758,8 +757,8 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
mov edx, [esp + 8 + 12] // dst_u
mov edi, [esp + 8 + 16] // dst_v
mov ecx, [esp + 8 + 20] // pix
pcmpeqb xmm7, xmm7 // generate mask 0x00ff00ff
psrlw xmm7, 8
pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff
psrlw xmm5, 8

wloop:
movdqa xmm0, [eax]
@@ -769,11 +768,11 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
lea eax, [eax + 32]
pavgb xmm0, xmm2
pavgb xmm1, xmm3
pand xmm0, xmm7 // UYVY -> UVUV
pand xmm1, xmm7
pand xmm0, xmm5 // UYVY -> UVUV
pand xmm1, xmm5
packuswb xmm0, xmm1
movdqa xmm1, xmm0
pand xmm0, xmm7 // U
pand xmm0, xmm5 // U
packuswb xmm0, xmm0
movq qword ptr [edx], xmm0
lea edx, [edx + 8]
@@ -797,14 +796,14 @@ void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"psrlw $0x8,%%xmm7\n"
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
"lea 0x20(%0),%0\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm7,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
@@ -814,15 +813,18 @@ static void YUY2ToI420RowY_SSE2(const uint8* src_yuy2,
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm5"
#endif
);
}

static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"psrlw $0x8,%%xmm7\n"
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
@@ -835,7 +837,7 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
"psrlw $0x8,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
@@ -850,7 +852,10 @@ static void YUY2ToI420RowUV_SSE2(const uint8* src_yuy2, int stride_yuy2,
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_yuy2)) // %4
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#define HAS_UYVYTOI420ROW_SSE2
@@ -872,15 +877,18 @@ static void UYVYToI420RowY_SSE2(const uint8* src_uyvy,
"+r"(dst_y), // %1
"+r"(pix) // %2
:
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}

static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
uint8* dst_u, uint8* dst_y, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"psrlw $0x8,%%xmm7\n"
"pcmpeqb %%xmm5,%%xmm5\n"
"psrlw $0x8,%%xmm5\n"
"1:"
"movdqa (%0),%%xmm0\n"
"movdqa 0x10(%0),%%xmm1\n"
@@ -889,11 +897,11 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
"lea 0x20(%0),%0\n"
"pavgb %%xmm2,%%xmm0\n"
"pavgb %%xmm3,%%xmm1\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm7,%%xmm1\n"
"pand %%xmm5,%%xmm0\n"
"pand %%xmm5,%%xmm1\n"
"packuswb %%xmm1,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"pand %%xmm7,%%xmm0\n"
"pand %%xmm5,%%xmm0\n"
"packuswb %%xmm0,%%xmm0\n"
"movq %%xmm0,(%1)\n"
"lea 0x8(%1),%1\n"
@@ -908,7 +916,10 @@ static void UYVYToI420RowUV_SSE2(const uint8* src_uyvy, int stride_uyvy,
"+r"(dst_y), // %2
"+r"(pix) // %3
: "r"(static_cast<intptr_t>(stride_uyvy)) // %4
: "memory"
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
);
}
#endif
@@ -975,7 +986,7 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
void (*YUY2ToI420RowY)(const uint8* src_yuy2,
uint8* dst_y, int pix);
#if defined(HAS_YUY2TOI420ROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_yuy2, 16) && (src_stride_yuy2 % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
@@ -1022,7 +1033,7 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
void (*UYVYToI420RowY)(const uint8* src_uyvy,
uint8* dst_y, int pix);
#if defined(HAS_UYVYTOI420ROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 16 == 0) &&
IS_ALIGNED(src_uyvy, 16) && (src_stride_uyvy % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0) &&
@@ -1053,7 +1064,6 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
}

// Convert I420 to ARGB.
// TODO(fbarchard): Add SSE2 version and supply C version for fallback.
int I420ToARGB(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1065,8 +1075,34 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 4 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow4_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
if (y & 1) {
@@ -1074,7 +1110,7 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
src_v += src_stride_v;
}
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
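
On the EMMS() call above: MMX instructions alias the x87 floating point register stack, so after any MMX row function runs, an emms must execute before other code touches floating point again. The intrinsic spelling, as a sketch (FlushMmxState is a made-up name):

#include <mmintrin.h>

static inline void FlushMmxState() {
  _mm_empty();  // emits the emms instruction
}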

@@ -1091,6 +1127,25 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToBGRARow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOBGRAROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_MMX;
} else
#endif
{
FastConvertYUVToBGRARow = FastConvertYUVToBGRARow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToBGRARow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
@@ -1104,7 +1159,7 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
return 0;
}

// Convert I420 to BGRA.
// Convert I420 to ABGR.
int I420ToABGR(const uint8* src_y, int src_stride_y,
const uint8* src_u, int src_stride_u,
const uint8* src_v, int src_stride_v,
@@ -1116,6 +1171,25 @@ int I420ToABGR(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToABGRRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOABGRROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOABGRROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_MMX;
} else
#endif
{
FastConvertYUVToABGRRow = FastConvertYUVToABGRRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToABGRRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
@@ -1141,14 +1215,33 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUVToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0)) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_MMX;
} else
#endif
{
FastConvertYUVToARGBRow = FastConvertYUVToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
FastConvertYUVToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -1165,14 +1258,31 @@ int I444ToARGB(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYUV444ToARGBRow)(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2)) {
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_MMX;
#else
{
FastConvertYUV444ToARGBRow = FastConvertYUV444ToARGBRow_C;
}
#endif
for (int y = 0; y < height; ++y) {
FastConvertYUV444ToRGB32Row(src_y, src_u, src_v, dst_argb, width);
FastConvertYUV444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
src_u += src_stride_u;
src_v += src_stride_v;
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}
@@ -1187,178 +1297,34 @@ int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
dst_argb = dst_argb + (height - 1) * dst_stride_argb;
dst_stride_argb = -dst_stride_argb;
}
void (*FastConvertYToARGBRow)(const uint8* y_buf,
uint8* rgb_buf,
int width);
#if defined(HAS_FASTCONVERTYUVTOARGBROW_SSE2)
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 2 == 0) &&
IS_ALIGNED(dst_argb, 8) && (dst_stride_argb % 8 == 0)) {
FastConvertYToARGBRow = FastConvertYToARGBRow_SSE2;
} else
#endif
#if defined(HAS_FASTCONVERTYUVTOARGBROW_MMX)
if (width % 2 == 0) {
FastConvertYToARGBRow = FastConvertYToARGBRow_MMX;
} else
#endif
{
FastConvertYToARGBRow = FastConvertYToARGBRow_C;
}
for (int y = 0; y < height; ++y) {
FastConvertYToRGB32Row(src_y, dst_argb, width);
FastConvertYToARGBRow(src_y, dst_argb, width);
dst_argb += dst_stride_argb;
src_y += src_stride_y;
}
// MMX used for FastConvertYUVToRGB32Row requires an emms instruction.
// MMX used for FastConvertYUVToARGBRow requires an emms instruction.
EMMS();
return 0;
}

// TODO(fbarchard): 64 bit version
#if defined(WIN32) && !defined(COVERAGE_ENABLED)

#define HAS_I400TOARGBROW_SSE2
__declspec(naked)
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
mov eax, [esp + 4] // src_y
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
pcmpeqb xmm7, xmm7 // generate mask 0xff000000
pslld xmm7, 24

wloop:
movq xmm0, qword ptr [eax]
lea eax, [eax + 8]
punpcklbw xmm0, xmm0
movdqa xmm1, xmm0
punpcklwd xmm0, xmm0
punpckhwd xmm1, xmm1
por xmm0, xmm7
por xmm1, xmm7
movdqa [edx], xmm0
movdqa [edx + 16], xmm1
lea edx, [edx + 32]
sub ecx, 8
ja wloop
ret
}
}

#define HAS_ABGRTOARGBROW_SSSE3
__declspec(naked)
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_abgr
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm7, _kShuffleMaskABGRToARGB

convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm7
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}

#define HAS_BGRATOARGBROW_SSSE3
__declspec(naked)
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
int pix) {
__asm {
mov eax, [esp + 4] // src_bgra
mov edx, [esp + 8] // dst_argb
mov ecx, [esp + 12] // pix
movdqa xmm7, _kShuffleMaskBGRAToARGB

convertloop :
movdqa xmm0, [eax]
lea eax, [eax + 16]
pshufb xmm0, xmm7
movdqa [edx], xmm0
lea edx, [edx + 16]
sub ecx, 4
ja convertloop
ret
}
}


#elif (defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)

// TODO(yuche): consider moving ARGB related codes to a separate file.
#define HAS_I400TOARGBROW_SSE2
static void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
asm volatile(
"pcmpeqb %%xmm7,%%xmm7\n"
"pslld $0x18,%%xmm7\n"
"1:"
"movq (%0),%%xmm0\n"
"lea 0x8(%0),%0\n"
"punpcklbw %%xmm0,%%xmm0\n"
"movdqa %%xmm0,%%xmm1\n"
"punpcklwd %%xmm0,%%xmm0\n"
"punpckhwd %%xmm1,%%xmm1\n"
"por %%xmm7,%%xmm0\n"
"por %%xmm7,%%xmm1\n"
"movdqa %%xmm0,(%1)\n"
"movdqa %%xmm1,0x10(%1)\n"
"lea 0x20(%1),%1\n"
"sub $0x8,%2\n"
"ja 1b\n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
:
: "memory"
);
}

#define HAS_ABGRTOARGBROW_SSSE3
static void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb,
int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm7,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_abgr), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskABGRToARGB) // %3
: "memory"
);
}

#define HAS_BGRATOARGBROW_SSSE3
static void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb,
int pix) {
asm volatile(
"movdqa (%3),%%xmm7\n"
"1:"
"movdqa (%0),%%xmm0\n"
"lea 0x10(%0),%0\n"
"pshufb %%xmm7,%%xmm0\n"
"movdqa %%xmm0,(%1)\n"
"lea 0x10(%1),%1\n"
"sub $0x4,%2\n"
"ja 1b\n"
: "+r"(src_bgra), // %0
"+r"(dst_argb), // %1
"+r"(pix) // %2
: "r"(kShuffleMaskBGRAToARGB) // %3
: "memory"
);
}

#endif

static void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
dst_argb += 4;
++src_y;
}
}

// Convert I400 to ARGB.
int I400ToARGB(const uint8* src_y, int src_stride_y,
uint8* dst_argb, int dst_stride_argb,
@@ -1370,7 +1336,7 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
}
void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix);
#if defined(HAS_I400TOARGBROW_SSE2)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSE2) &&
if (TestCpuFlag(kCpuHasSSE2) &&
(width % 8 == 0) &&
IS_ALIGNED(src_y, 8) && (src_stride_y % 8 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
@@ -1389,22 +1355,6 @@ int I400ToARGB(const uint8* src_y, int src_stride_y,
return 0;
}

static void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
uint8 b = src_abgr[2];
uint8 a = src_abgr[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_abgr += 4;
}
}

int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
uint8* dst_argb, int dst_stride_argb,
int width, int height) {
@@ -1415,7 +1365,7 @@ int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
}
void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
#if defined(HAS_ABGRTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) &&
IS_ALIGNED(src_abgr, 16) && (src_stride_abgr % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
@@ -1434,22 +1384,6 @@ void (*ABGRToARGBRow)(const uint8* src_abgr, uint8* dst_argb, int pix);
return 0;
}

static void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
uint8 g = src_bgra[2];
uint8 b = src_bgra[3];
dst_argb[0] = b;
dst_argb[1] = g;
dst_argb[2] = r;
dst_argb[3] = a;
dst_argb += 4;
src_bgra += 4;
}
}

// Convert BGRA to ARGB.
int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
uint8* dst_argb, int dst_stride_argb,
@@ -1461,7 +1395,7 @@ int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
}
void (*BGRAToARGBRow)(const uint8* src_bgra, uint8* dst_argb, int pix);
#if defined(HAS_BGRATOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) &&
IS_ALIGNED(src_bgra, 16) && (src_stride_bgra % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
@@ -1491,7 +1425,7 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
}
void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix);
#if defined(HAS_ARGBTOYROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 4 == 0) &&
IS_ALIGNED(src_argb, 16) && (src_stride_argb % 16 == 0) &&
IS_ALIGNED(dst_y, 16) && (dst_stride_y % 16 == 0)) {
@@ -1522,7 +1456,7 @@ int RAWToARGB(const uint8* src_raw, int src_stride_raw,
}
void (*RAWToARGBRow)(const uint8* src_raw, uint8* dst_argb, int pix);
#if defined(HAS_RAWTOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_raw, 16) && (src_stride_raw % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {
@@ -1552,7 +1486,7 @@ int BG24ToARGB(const uint8* src_bg24, int src_stride_bg24,
}
void (*BG24ToARGBRow)(const uint8* src_bg24, uint8* dst_argb, int pix);
#if defined(HAS_BG24TOARGBROW_SSSE3)
if (libyuv::TestCpuFlag(libyuv::kCpuHasSSSE3) &&
if (TestCpuFlag(kCpuHasSSSE3) &&
(width % 16 == 0) &&
IS_ALIGNED(src_bg24, 16) && (src_stride_bg24 % 16 == 0) &&
IS_ALIGNED(dst_argb, 16) && (dst_stride_argb % 16 == 0)) {

source/row.h (166 changed lines)

@@ -13,9 +13,13 @@

#include "libyuv/basic_types.h"

#define kMaxStride (2048 * 4)

// The following are available on all x86 platforms
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#if (defined(WIN32) || defined(__x86_64__) || defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ABGRTOARGBROW_SSSE3
#define HAS_BGRATOARGBROW_SSSE3
#define HAS_ARGBTOYROW_SSSE3
#define HAS_BG24TOARGBROW_SSSE3
#define HAS_RAWTOARGBROW_SSSE3
@@ -23,19 +27,41 @@
#define HAS_RAWTOYROW_SSSE3
#define HAS_RGB24TOUVROW_SSSE3
#define HAS_RAWTOUVROW_SSSE3
#endif

// The following are available only on Windows
#if defined(WIN32) \
&& !defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_BGRATOYROW_SSSE3
#define HAS_ABGRTOYROW_SSSE3
#define HAS_I400TOARGBROW_SSE2
#endif

// The following are available on Windows and Linux
#if (defined(WIN32) || defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_ARGBTOUVROW_SSSE3
#define HAS_BGRATOUVROW_SSSE3
#define HAS_ABGRTOUVROW_SSSE3
#endif

// The following are available on Linux (32/64 bit)
// TODO(fbarchard): enable for fpic on linux
#if (defined(__x86_64__) || \
(defined(__i386__) && !defined(__pic__))) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_FASTCONVERTYUVTOARGBROW_SSE2
#define HAS_FASTCONVERTYUVTOBGRAROW_SSE2
#define HAS_FASTCONVERTYUVTOABGRROW_SSE2
#endif

// The following are available on Windows and GCC 32 bit
#if (defined(WIN32) || \
defined(__i386__)) && \
!defined(COVERAGE_ENABLED) && !defined(TARGET_IPHONE_SIMULATOR)
#define HAS_FASTCONVERTYUVTOARGBROW_MMX
#define HAS_FASTCONVERTYUVTOBGRAROW_MMX
#define HAS_FASTCONVERTYUVTOABGRROW_MMX
#endif

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
void BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix);
@@ -75,56 +101,128 @@ void RAWToUVRow_C(const uint8* src_argb0, int src_stride_argb,
uint8* dst_u, uint8* dst_v, int width);

#ifdef HAS_BG24TOARGBROW_SSSE3
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix);
#endif
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix);
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix);
void BG24ToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);
void RAWToARGBRow_C(const uint8* src_bg24, uint8* dst_argb, int pix);

#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix);
#endif
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix);

#if defined(_MSC_VER)
#define SIMD_ALIGNED(var) __declspec(align(16)) var
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
#else
#else // __GNUC__
#define SIMD_ALIGNED(var) var __attribute__((aligned(16)))
#define TALIGN16(t, var) t var __attribute__((aligned(16)))
typedef signed char __attribute__((vector_size(16))) vec8;
typedef unsigned char __attribute__((vector_size(16))) uvec8;
#endif
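
Usage sketch for the alignment macro defined above: declare a 16-byte aligned constant so movdqa, which faults on unaligned addresses, can load it directly (kZeroRow is a made-up name):

SIMD_ALIGNED(const uint8 kZeroRow[16]) = {0};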

#ifdef OSX
extern SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);
#else
extern SIMD_ALIGNED(const int16 _kCoefficientsRgbY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsBgraY[768][4]);
extern SIMD_ALIGNED(const int16 _kCoefficientsAbgrY[768][4]);
#endif
void FastConvertYUVToRGB32Row(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
extern "C" SIMD_ALIGNED(const int16 kCoefficientsRgbY[768][4]);
extern "C" SIMD_ALIGNED(const int16 kCoefficientsBgraY[768][4]);
extern "C" SIMD_ALIGNED(const int16 kCoefficientsAbgrY[768][4]);

void FastConvertYUVToBGRARow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
void FastConvertYUVToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToBGRARow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToABGRRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYToARGBRow_C(const uint8* y_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToABGRRow(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
#ifdef HAS_FASTCONVERTYUVTOARGBROW_SSE2
void FastConvertYUVToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
void FastConvertYUVToARGBRow4_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToBGRARow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToABGRRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUV444ToARGBRow_SSE2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYToARGBRow_SSE2(const uint8* y_buf,
uint8* rgb_buf,
int width);
#endif

#ifdef HAS_FASTCONVERTYUVTOARGBROW_MMX
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYToRGB32Row(const uint8* y_buf,
uint8* rgb_buf,
int width);
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
|
||||
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
|
||||
uint8* rgb_buf,
|
||||
int width);
|
||||
#endif
|
||||
|
||||
// Method to force C version.
|
||||
//#define USE_MMX 0
|
||||
|
||||
1039
source/row_posix.cc
File diff suppressed because it is too large
@ -10,8 +10,6 @@

#include "row.h"

#define kMaxStride (2048 * 4)

extern "C" {

#define MAKETABLE(NAME) \
@ -232,11 +230,7 @@ SIMD_ALIGNED(const int16 NAME[256 * 3][4]) = {\
  0 \
}

#ifdef OSX
MAKETABLE(kCoefficientsRgbY)
#else
MAKETABLE(_kCoefficientsRgbY)
#endif

#undef RGBY
#undef RGBU
@ -264,12 +258,7 @@ MAKETABLE(_kCoefficientsRgbY)
  0 \
}

#ifdef OSX
MAKETABLE(kCoefficientsBgraY)
#else
MAKETABLE(_kCoefficientsBgraY)
#endif


#undef RGBY
#undef RGBU
@ -297,12 +286,39 @@ MAKETABLE(_kCoefficientsBgraY)
  0 \
}

#ifdef OSX
MAKETABLE(kCoefficientsAbgrY)
#else
MAKETABLE(_kCoefficientsAbgrY)
#endif

void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
    uint8 r = src_abgr[0];
    uint8 g = src_abgr[1];
    uint8 b = src_abgr[2];
    uint8 a = src_abgr[3];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = a;
    dst_argb += 4;
    src_abgr += 4;
  }
}

void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
    // To support in-place conversion.
    uint8 a = src_bgra[0];
    uint8 r = src_bgra[1];
    uint8 g = src_bgra[2];
    uint8 b = src_bgra[3];
    dst_argb[0] = b;
    dst_argb[1] = g;
    dst_argb[2] = r;
    dst_argb[3] = a;
    dst_argb += 4;
    src_bgra += 4;
  }
}

void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
  for (int x = 0; x < pix; ++x) {
@ -466,4 +482,133 @@ void RAWToUVRow_SSSE3(const uint8* src_argb, int src_stride_argb,
#endif
#endif

void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
  // Copy a Y to RGB.
  for (int x = 0; x < pix; ++x) {
    uint8 y = src_y[0];
    dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
    dst_argb[3] = 255u;
    dst_argb += 4;
    ++src_y;
  }
}

// C reference code that mimics the YUV assembly.
#define packuswb(x) ((x) < 0 ? 0 : ((x) > 255 ? 255 : (x)))
#define paddsw(x, y) (((x) + (y)) < -32768 ? -32768 : \
    (((x) + (y)) > 32767 ? 32767 : ((x) + (y))))
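
// For intuition (worked values, not from the original source): packuswb(300)
// yields 255, packuswb(-20) yields 0, and paddsw(30000, 10000) clamps to
// 32767, matching the saturating MMX instructions of the same names.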

static inline void YuvPixel(uint8 y,
                            uint8 u,
                            uint8 v,
                            uint8* rgb_buf,
                            int ashift,
                            int rshift,
                            int gshift,
                            int bshift) {

  int b = kCoefficientsRgbY[256+u][0];
  int g = kCoefficientsRgbY[256+u][1];
  int r = kCoefficientsRgbY[256+u][2];
  int a = kCoefficientsRgbY[256+u][3];

  b = paddsw(b, kCoefficientsRgbY[512+v][0]);
  g = paddsw(g, kCoefficientsRgbY[512+v][1]);
  r = paddsw(r, kCoefficientsRgbY[512+v][2]);
  a = paddsw(a, kCoefficientsRgbY[512+v][3]);

  b = paddsw(b, kCoefficientsRgbY[y][0]);
  g = paddsw(g, kCoefficientsRgbY[y][1]);
  r = paddsw(r, kCoefficientsRgbY[y][2]);
  a = paddsw(a, kCoefficientsRgbY[y][3]);

  b >>= 6;
  g >>= 6;
  r >>= 6;
  a >>= 6;

  *reinterpret_cast<uint32*>(rgb_buf) = (packuswb(b) << bshift) |
                                        (packuswb(g) << gshift) |
                                        (packuswb(r) << rshift) |
                                        (packuswb(a) << ashift);
}
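
// Reading aid (inferred from the indexing above, not in the original source):
// kCoefficientsRgbY holds three 256-entry sections of four int16 values, with
// rows [0..255] for Y, [256..511] for U and [512..767] for V, all in 6-bit
// fixed point, hence the >> 6 before packing. The ashift/rshift/gshift/bshift
// arguments pick the output byte lane for each channel, so 24,16,8,0 builds
// ARGB and 24,0,8,16 builds ABGR.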

void FastConvertYUVToARGBRow_C(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
  for (int x = 0; x < width - 1; x += 2) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 16, 8, 0);
    y_buf += 2;
    u_buf += 1;
    v_buf += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 16, 8, 0);
  }
}
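
// A minimal usage sketch, compiled out with #if 0 (the helper name and its
// stride parameters are hypothetical, not part of this change): it shows how
// a caller could drive the row function across a whole I420 frame.
#if 0
static void I420ToARGBFrameSketch(const uint8* src_y, int src_stride_y,
                                  const uint8* src_u, int src_stride_u,
                                  const uint8* src_v, int src_stride_v,
                                  uint8* dst_argb, int dst_stride_argb,
                                  int width, int height) {
  for (int y = 0; y < height; ++y) {
    FastConvertYUVToARGBRow_C(src_y, src_u, src_v, dst_argb, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
    if (y & 1) {  // I420 chroma planes are subsampled 2x vertically.
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
}
#endif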

void FastConvertYUVToBGRARow_C(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
  for (int x = 0; x < width - 1; x += 2) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 0, 8, 16, 24);
    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 0, 8, 16, 24);
    y_buf += 2;
    u_buf += 1;
    v_buf += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 0, 8, 16, 24);
  }
}

void FastConvertYUVToABGRRow_C(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* rgb_buf,
                               int width) {
  for (int x = 0; x < width - 1; x += 2) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
    YuvPixel(y_buf[1], u_buf[0], v_buf[0], rgb_buf + 4, 24, 0, 8, 16);
    y_buf += 2;
    u_buf += 1;
    v_buf += 1;
    rgb_buf += 8;  // Advance 2 pixels.
  }
  if (width & 1) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf + 0, 24, 0, 8, 16);
  }
}

void FastConvertYUV444ToARGBRow_C(const uint8* y_buf,
                                  const uint8* u_buf,
                                  const uint8* v_buf,
                                  uint8* rgb_buf,
                                  int width) {
  for (int x = 0; x < width; ++x) {
    YuvPixel(y_buf[0], u_buf[0], v_buf[0], rgb_buf, 24, 16, 8, 0);
    y_buf += 1;
    u_buf += 1;
    v_buf += 1;
    rgb_buf += 4;  // Advance 1 pixel.
  }
}

void FastConvertYToARGBRow_C(const uint8* y_buf,
                             uint8* rgb_buf,
                             int width) {
  for (int x = 0; x < width; ++x) {
    YuvPixel(y_buf[0], 128, 128, rgb_buf, 24, 16, 8, 0);
    y_buf += 1;
    rgb_buf += 4;  // Advance 1 pixel.
  }
}

} // extern "C"

@ -74,6 +74,160 @@ extern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ABGR to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskABGRToARGB[16]) = {
  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
};

// Shuffle table for converting BGRA to ARGB.
extern "C" TALIGN16(const uint8, kShuffleMaskBGRAToARGB[16]) = {
  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
};
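
// For reference (standard SSSE3 pshufb semantics, not part of this change):
// each byte of a mask selects the source byte that lands in that destination
// position, so 2u, 1u, 0u, 3u swaps bytes 0 and 2 of every 4-byte pixel, the
// R and B swap that turns ABGR into ARGB while leaving G and A in place.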

__declspec(naked)
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
__asm {
    mov eax, [esp + 4]   // src_y
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24

  wloop:
    movq xmm0, qword ptr [eax]
    lea eax, [eax + 8]
    punpcklbw xmm0, xmm0
    movdqa xmm1, xmm0
    punpcklwd xmm0, xmm0
    punpckhwd xmm1, xmm1
    por xmm0, xmm5
    por xmm1, xmm5
    movdqa [edx], xmm0
    movdqa [edx + 16], xmm1
    lea edx, [edx + 32]
    sub ecx, 8
    ja wloop
    ret
  }
}
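
// How the expansion works (explanatory note, based on the SSE2 semantics of
// the unpack instructions): punpcklbw xmm0, xmm0 doubles each Y byte, and the
// punpcklwd / punpckhwd pair doubles the resulting words again, so every Y
// value fills four consecutive bytes, a gray B, G, R triple plus a spare byte
// that por then sets to 0xff from the alpha mask in xmm5.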

__declspec(naked)
void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
__asm {
    mov eax, [esp + 4]   // src_abgr
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, _kShuffleMaskABGRToARGB

  convertloop :
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    pshufb xmm0, xmm5
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    ja convertloop
    ret
  }
}

__declspec(naked)
void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
__asm {
    mov eax, [esp + 4]   // src_bgra
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    movdqa xmm5, _kShuffleMaskBGRAToARGB

  convertloop :
    movdqa xmm0, [eax]
    lea eax, [eax + 16]
    pshufb xmm0, xmm5
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 4
    ja convertloop
    ret
  }
}

__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
    mov eax, [esp + 4]   // src_bg24
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, _kShuffleMaskBG24ToARGB

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
  }
}

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
__asm {
    mov eax, [esp + 4]   // src_raw
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm5, xmm5   // generate mask 0xff000000
    pslld xmm5, 24
    movdqa xmm4, _kShuffleMaskRAWToARGB

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm4
    por xmm2, xmm5
    palignr xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm4
    movdqa [edx + 32], xmm2
    por xmm0, xmm5
    pshufb xmm1, xmm4
    movdqa [edx], xmm0
    por xmm1, xmm5
    palignr xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm4
    movdqa [edx + 16], xmm1
    por xmm3, xmm5
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
  }
}

// Convert 16 ARGB pixels (64 bytes) to 16 Y values
__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
@ -81,25 +235,25 @@ __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm7, _kARGBToY
    movdqa xmm6, _kAddY16
    movdqa xmm5, _kAddY16
    movdqa xmm4, _kARGBToY

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm6
    paddb xmm0, xmm5
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
@ -114,25 +268,25 @@ __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm7, _kBGRAToY
    movdqa xmm6, _kAddY16
    movdqa xmm5, _kAddY16
    movdqa xmm4, _kBGRAToY

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm6
    paddb xmm0, xmm5
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
@ -147,25 +301,25 @@ __asm {
    mov eax, [esp + 4]   /* src_argb */
    mov edx, [esp + 8]   /* dst_y */
    mov ecx, [esp + 12]  /* pix */
    movdqa xmm7, _kABGRToY
    movdqa xmm6, _kAddY16
    movdqa xmm5, _kAddY16
    movdqa xmm4, _kABGRToY

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm2, [eax + 32]
    movdqa xmm3, [eax + 48]
    pmaddubsw xmm0, xmm7
    pmaddubsw xmm1, xmm7
    pmaddubsw xmm2, xmm7
    pmaddubsw xmm3, xmm7
    pmaddubsw xmm0, xmm4
    pmaddubsw xmm1, xmm4
    pmaddubsw xmm2, xmm4
    pmaddubsw xmm3, xmm4
    lea eax, [eax + 64]
    phaddw xmm0, xmm1
    phaddw xmm2, xmm3
    psrlw xmm0, 7
    psrlw xmm2, 7
    packuswb xmm0, xmm2
    paddb xmm0, xmm6
    paddb xmm0, xmm5
    movdqa [edx], xmm0
    lea edx, [edx + 16]
    sub ecx, 16
@ -366,230 +520,138 @@ __asm {
  }
}

__declspec(naked)
void BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
__asm {
    mov eax, [esp + 4]   // src_bg24
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm7, xmm7   // generate mask 0xff000000
    pslld xmm7, 24
    movdqa xmm6, _kShuffleMaskBG24ToARGB

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm6
    por xmm2, xmm7
    palignr xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm6
    movdqa [edx + 32], xmm2
    por xmm0, xmm7
    pshufb xmm1, xmm6
    movdqa [edx], xmm0
    por xmm1, xmm7
    palignr xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm6
    movdqa [edx + 16], xmm1
    por xmm3, xmm7
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
#define YUVTORGB(TABLE) __asm { \
  __asm convertloop : \
  __asm movzx eax, byte ptr [edi] \
  __asm lea edi, [edi + 1] \
  __asm movzx ebx, byte ptr [esi] \
  __asm lea esi, [esi + 1] \
  __asm movq mm0, [TABLE + 2048 + 8 * eax] \
  __asm movzx eax, byte ptr [edx] \
  __asm paddsw mm0, [TABLE + 4096 + 8 * ebx] \
  __asm movzx ebx, byte ptr [edx + 1] \
  __asm movq mm1, [TABLE + 8 * eax] \
  __asm lea edx, [edx + 2] \
  __asm movq mm2, [TABLE + 8 * ebx] \
  __asm paddsw mm1, mm0 \
  __asm paddsw mm2, mm0 \
  __asm psraw mm1, 6 \
  __asm psraw mm2, 6 \
  __asm packuswb mm1, mm2 \
  __asm movq [ebp], mm1 \
  __asm lea ebp, [ebp + 8] \
  __asm sub ecx, 2 \
  __asm ja convertloop \
}
}
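
// Layout note (inferred from the offsets, not in the original comments): the
// coefficient tables hold 256 eight-byte entries per channel, so
// TABLE + 8 * Y addresses the Y section, TABLE + 2048 + 8 * U the U section
// and TABLE + 4096 + 8 * V the V section. The macro expects edx = Y, edi = U,
// esi = V, ebp = dst and ecx = width, which is why each wrapper below loads
// exactly those registers before invoking it.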

__declspec(naked)
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
__asm {
    mov eax, [esp + 4]   // src_raw
    mov edx, [esp + 8]   // dst_argb
    mov ecx, [esp + 12]  // pix
    pcmpeqb xmm7, xmm7   // generate mask 0xff000000
    pslld xmm7, 24
    movdqa xmm6, _kShuffleMaskRAWToARGB

  convertloop :
    movdqa xmm0, [eax]
    movdqa xmm1, [eax + 16]
    movdqa xmm3, [eax + 32]
    lea eax, [eax + 48]
    movdqa xmm2, xmm3
    palignr xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
    pshufb xmm2, xmm6
    por xmm2, xmm7
    palignr xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
    pshufb xmm0, xmm6
    movdqa [edx + 32], xmm2
    por xmm0, xmm7
    pshufb xmm1, xmm6
    movdqa [edx], xmm0
    por xmm1, xmm7
    palignr xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
    pshufb xmm3, xmm6
    movdqa [edx + 16], xmm1
    por xmm3, xmm7
    movdqa [edx + 48], xmm3
    lea edx, [edx + 64]
    sub ecx, 16
    ja convertloop
    ret
  }
}

__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
__asm {
    pushad
    mov edx, [esp + 32 + 4]
    mov edi, [esp + 32 + 8]
    mov esi, [esp + 32 + 12]
    mov ebp, [esp + 32 + 16]
    mov ecx, [esp + 32 + 20]

  convertloop :
    movzx eax, byte ptr [edi]
    lea edi, [edi + 1]
    movzx ebx, byte ptr [esi]
    lea esi, [esi + 1]
    movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx eax, byte ptr [edx]
    paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    movzx ebx, byte ptr [edx + 1]
    movq mm1, [_kCoefficientsRgbY + 8 * eax]
    lea edx, [edx + 2]
    movq mm2, [_kCoefficientsRgbY + 8 * ebx]
    paddsw mm1, mm0
    paddsw mm2, mm0
    psraw mm1, 6
    psraw mm2, 6
    packuswb mm1, mm2
    movntq [ebp], mm1
    lea ebp, [ebp + 8]
    sub ecx, 2
    ja convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
__asm {
    pushad
    mov edx, [esp + 32 + 4]
    mov edi, [esp + 32 + 8]
    mov esi, [esp + 32 + 12]
    mov ebp, [esp + 32 + 16]
    mov ecx, [esp + 32 + 20]

  convertloop :
    movzx eax, byte ptr [edi]
    lea edi, [edi + 1]
    movzx ebx, byte ptr [esi]
    lea esi, [esi + 1]
    movq mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
    movzx eax, byte ptr [edx]
    paddsw mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
    movzx ebx, byte ptr [edx + 1]
    movq mm1, [_kCoefficientsBgraY + 8 * eax]
    lea edx, [edx + 2]
    movq mm2, [_kCoefficientsBgraY + 8 * ebx]
    paddsw mm1, mm0
    paddsw mm2, mm0
    psraw mm1, 6
    psraw mm2, 6
    packuswb mm1, mm2
    movntq [ebp], mm1
    lea ebp, [ebp + 8]
    sub ecx, 2
    ja convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
__asm {
    pushad
    mov edx, [esp + 32 + 4]
    mov edi, [esp + 32 + 8]
    mov esi, [esp + 32 + 12]
    mov ebp, [esp + 32 + 16]
    mov ecx, [esp + 32 + 20]

  convertloop :
    movzx eax, byte ptr [edi]
    lea edi, [edi + 1]
    movzx ebx, byte ptr [esi]
    lea esi, [esi + 1]
    movq mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
    movzx eax, byte ptr [edx]
    paddsw mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
    movzx ebx, byte ptr [edx + 1]
    movq mm1, [_kCoefficientsAbgrY + 8 * eax]
    lea edx, [edx + 2]
    movq mm2, [_kCoefficientsAbgrY + 8 * ebx]
    paddsw mm1, mm0
    paddsw mm2, mm0
    psraw mm1, 6
    psraw mm2, 6
    packuswb mm1, mm2
    movntq [ebp], mm1
    lea ebp, [ebp + 8]
    sub ecx, 2
    ja convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
void FastConvertYUVToARGBRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
__asm {
    pushad
    mov edx, [esp + 32 + 4]   // Y
    mov edi, [esp + 32 + 8]   // U
    mov esi, [esp + 32 + 12]  // V
    mov ebp, [esp + 32 + 16]  // rgb
    mov ecx, [esp + 32 + 20]  // width
    push ebx
    push esi
    push edi
    push ebp
    mov edx, [esp + 16 + 4]
    mov edi, [esp + 16 + 8]
    mov esi, [esp + 16 + 12]
    mov ebp, [esp + 16 + 16]
    mov ecx, [esp + 16 + 20]

    YUVTORGB(kCoefficientsRgbY)

    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}
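
// Stack arithmetic (explanatory note): the four pushes above grow the stack
// by 16 bytes, so the arguments move from [esp + 4] onward to [esp + 16 + 4]
// onward; the pushad-based versions above address them at [esp + 32 + ...]
// for the same reason, since pushad saves eight 4-byte registers.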

__declspec(naked)
void FastConvertYUVToBGRARow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
__asm {
    push ebx
    push esi
    push edi
    push ebp
    mov edx, [esp + 16 + 4]
    mov edi, [esp + 16 + 8]
    mov esi, [esp + 16 + 12]
    mov ebp, [esp + 16 + 16]
    mov ecx, [esp + 16 + 20]

    YUVTORGB(kCoefficientsBgraY)

    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}

__declspec(naked)
void FastConvertYUVToABGRRow_MMX(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
__asm {
    push ebx
    push esi
    push edi
    push ebp
    mov edx, [esp + 16 + 4]
    mov edi, [esp + 16 + 8]
    mov esi, [esp + 16 + 12]
    mov ebp, [esp + 16 + 16]
    mov ecx, [esp + 16 + 20]

    YUVTORGB(kCoefficientsAbgrY)

    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}

__declspec(naked)
void FastConvertYUV444ToARGBRow_MMX(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    uint8* rgb_buf,
                                    int width) {
__asm {
    push ebx
    push esi
    push edi
    push ebp
    mov edx, [esp + 16 + 4]
    mov edi, [esp + 16 + 8]
    mov esi, [esp + 16 + 12]
    mov ebp, [esp + 16 + 16]
    mov ecx, [esp + 16 + 20]

  convertloop :
    movzx eax, byte ptr [edi]
    lea edi, [edi + 1]
    movzx ebx, byte ptr [esi]
    lea esi, [esi + 1]
    movq mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movq mm0, [kCoefficientsRgbY + 2048 + 8 * eax]
    movzx eax, byte ptr [edx]
    paddsw mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    paddsw mm0, [kCoefficientsRgbY + 4096 + 8 * ebx]
    lea edx, [edx + 1]
    paddsw mm0, [_kCoefficientsRgbY + 8 * eax]
    paddsw mm0, [kCoefficientsRgbY + 8 * eax]
    psraw mm0, 6
    packuswb mm0, mm0
    movd [ebp], mm0
@ -597,15 +659,18 @@ void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
    sub ecx, 1
    ja convertloop

    popad
    pop ebp
    pop edi
    pop esi
    pop ebx
    ret
  }
}

__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
void FastConvertYToARGBRow_MMX(const uint8* y_buf,
                               uint8* rgb_buf,
                               int width) {
__asm {
    push ebx
    mov eax, [esp + 4 + 4]   // Y
@ -614,10 +679,10 @@ void FastConvertYToRGB32Row(const uint8* y_buf,

  convertloop :
    movzx ebx, byte ptr [eax]
    movq mm0, [_kCoefficientsRgbY + 8 * ebx]
    movq mm0, [kCoefficientsRgbY + 8 * ebx]
    psraw mm0, 6
    movzx ebx, byte ptr [eax + 1]
    movq mm1, [_kCoefficientsRgbY + 8 * ebx]
    movq mm1, [kCoefficientsRgbY + 8 * ebx]
    psraw mm1, 6
    packuswb mm0, mm1
    lea eax, [eax + 2]

@ -42,6 +42,7 @@ enum FourCC {
  FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
  FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
  FOURCC_M420 = FOURCC('M', '4', '2', '0'),
  FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
  FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
  FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
  FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),