mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
splituv and mirroruv in row use 2 pixels at a time in C
BUG=none
TEST=none
Review URL: https://webrtc-codereview.appspot.com/432006
git-svn-id: http://libyuv.googlecode.com/svn/trunk@201 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
parent f69e90a19e
commit 16a96645b4
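The change unrolls the C fallback row functions (SplitUV_C, MirrorRow_C, MirrorRowUV_C, YUY2ToYRow_C, UYVYToYRow_C) so that each loop iteration handles two pixels, with a final check for an odd width. As a minimal sketch of that pattern, here is the two-pixel SplitUV_C as it appears in the diff below, lightly commented; the uint8 typedef stands in for libyuv's own type so the snippet is self-contained:

typedef unsigned char uint8;  // stand-in for libyuv's uint8 typedef

// De-interleave a UV row into separate U and V planes, two pixels per pass.
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
  for (int x = 0; x < width - 1; x += 2) {
    dst_u[x] = src_uv[0];        // even bytes go to the U plane
    dst_u[x + 1] = src_uv[2];
    dst_v[x] = src_uv[1];        // odd bytes go to the V plane
    dst_v[x + 1] = src_uv[3];
    src_uv += 4;                 // two UV pairs consumed per iteration
  }
  if (width & 1) {               // odd width: copy the remaining UV pair
    dst_u[width - 1] = src_uv[0];
    dst_v[width - 1] = src_uv[1];
  }
}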
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 200
Version: 201
License: BSD
License File: LICENSE


@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 200
#define LIBYUV_VERSION 201

#endif // INCLUDE_LIBYUV_VERSION_H_


@@ -23,12 +23,6 @@ extern "C" {

#if (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__)) && \
!defined(YUV_DISABLE_ASM)
// Note static const preferred, but gives internal compiler error on gcc 4.2
// Shuffle table for reversing the bytes of UV channels.
uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

#if defined(__APPLE__) && defined(__i386__)
#define DECLARE_FUNCTION(name) \
".text \n" \
@@ -759,8 +753,7 @@ static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
static void TransposeWx8_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int w) {
int i;
for (i = 0; i < w; ++i) {
for (int i = 0; i < w; ++i) {
dst[0] = src[0 * src_stride];
dst[1] = src[1 * src_stride];
dst[2] = src[2 * src_stride];
@@ -777,9 +770,8 @@ static void TransposeWx8_C(const uint8* src, int src_stride,
static void TransposeWxH_C(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width, int height) {
int i, j;
for (i = 0; i < width; ++i)
for (j = 0; j < height; ++j)
for (int i = 0; i < width; ++i)
for (int j = 0; j < height; ++j)
dst[i * dst_stride + j] = src[j * src_stride + i];
}

@@ -1005,79 +997,6 @@ void RotateUV270(const uint8* src, int src_stride,
width, height);
}

#if defined(_M_IX86) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_a
mov edi, [esp + 4 + 12] // dst_b
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx

convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop

pop edi
ret
}
}

#elif (defined(__i386__) || defined(__x86_64__)) && \
!defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_UV_SSSE3
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_a, uint8* dst_b,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif

static void MirrorRowUV_C(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
src += (width << 1) - 2;
for (int i = 0; i < width; ++i) {
dst_a[i] = src[0];
dst_b[i] = src[1];
src -= 2;
}
}

void RotateUV180(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,

@@ -19,94 +19,6 @@ extern "C" {

#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)

void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"

// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"

// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"

"beq 2f \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #16 \n"

// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"

"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"

// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16

"subs %2, #16 \n"
"bge 1b \n"

// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"

"add %1, #16 \n"

"2: \n"

"mov r3, #-3 \n"

"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"

// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2

"subs %2, #2 \n"
"bge 3b \n"

"adds %2, #2 \n"
"beq 5f \n"

"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"

"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}

static const uvec8 vtbl_4x4_transpose =
{ 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };

@@ -272,80 +184,6 @@ void TransposeWx8_NEON(const uint8* src, int src_stride,
);
}

void MirrorRowUV_NEON(const uint8* src,
uint8* dst_a, uint8* dst_b,
int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width

// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3 \n"

"beq 2f \n"

// the output is written in to two blocks.
"mov r12, #-8 \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #8 \n"
"sub %2, #8 \n"

// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"

"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"

"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8

"subs %3, #8 \n"
"bge 1b \n"

// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

"add %1, #8 \n"
"add %2, #8 \n"

"2: \n"

"mov r12, #-1 \n"

"sub %1, #1 \n"
"sub %2, #1 \n"

"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1

"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}

static const uvec8 vtbl_4x4_transpose_di =
{ 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };


@@ -54,6 +54,7 @@ extern "C" {
#define HAS_I444TOARGBROW_SSSE3
#define HAS_MIRRORROW_SSSE3
#define HAS_MIRRORROW_SSE2
#define HAS_MIRRORROWUV_SSSE3
#define HAS_SPLITUV_SSE2
#define HAS_COPYROW_SSE2
#define HAS_COPYROW_X86
@@ -66,6 +67,7 @@ extern "C" {
// The following are available on Neon platforms
#if defined(__ARM_NEON__) && !defined(YUV_DISABLE_ASM)
#define HAS_MIRRORROW_NEON
#define HAS_MIRRORROWUV_NEON
#define HAS_SPLITUV_NEON
#define HAS_COPYROW_NEON
#define HAS_I420TOARGBROW_NEON
@@ -126,6 +128,10 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);

void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_NEON(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
void MirrorRowUV_C(const uint8* src, uint8* dst_u, uint8* dst_v, int width);

void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);
void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix);

@@ -18,8 +18,8 @@ namespace libyuv {
extern "C" {
#endif

void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 r = src_abgr[0];
uint8 g = src_abgr[1];
@@ -34,8 +34,8 @@ void ABGRToARGBRow_C(const uint8* src_abgr, uint8* dst_argb, int pix) {
}
}

void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
// To support in-place conversion.
uint8 a = src_bgra[0];
uint8 r = src_bgra[1];
@@ -50,8 +50,8 @@ void BGRAToARGBRow_C(const uint8* src_bgra, uint8* dst_argb, int pix) {
}
}

void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb24[0];
uint8 g = src_rgb24[1];
uint8 r = src_rgb24[2];
@@ -64,8 +64,8 @@ void RGB24ToARGBRow_C(const uint8* src_rgb24, uint8* dst_argb, int pix) {
}
}

void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 r = src_raw[0];
uint8 g = src_raw[1];
uint8 b = src_raw[2];
@@ -78,8 +78,8 @@ void RAWToARGBRow_C(const uint8* src_raw, uint8* dst_argb, int pix) {
}
}

void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x07) << 3);
uint8 r = src_rgb[1] >> 3;
@@ -92,8 +92,8 @@ void RGB565ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_rgb[0] & 0x1f;
uint8 g = (src_rgb[0] >> 5) | ((src_rgb[1] & 0x03) << 3);
uint8 r = (src_rgb[1] & 0x7c) >> 2;
@@ -107,8 +107,8 @@ void ARGB1555ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int width) {
for (int x = 0; x < width; ++x) {
uint8 a = src_rgb[1] >> 4;
uint8 r = src_rgb[1] & 0x0f;
uint8 g = src_rgb[0] >> 4;
@@ -122,8 +122,8 @@ void ARGB4444ToARGBRow_C(const uint8* src_rgb, uint8* dst_argb, int pix) {
}
}

void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -135,8 +135,8 @@ void ARGBToRGB24Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0];
uint8 g = src_argb[1];
uint8 r = src_argb[2];
@@ -149,8 +149,8 @@ void ARGBToRAWRow_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}

// TODO(fbarchard): support big endian CPU
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 2;
uint8 r = src_argb[2] >> 3;
@@ -160,8 +160,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 3;
uint8 g = src_argb[1] >> 3;
uint8 r = src_argb[2] >> 3;
@@ -172,8 +172,8 @@ void ARGBToARGB1555Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
}
}

void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int pix) {
for (int x = 0; x < pix; ++x) {
void ARGBToARGB4444Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
for (int x = 0; x < width; ++x) {
uint8 b = src_argb[0] >> 4;
uint8 g = src_argb[1] >> 4;
uint8 r = src_argb[2] >> 4;
@@ -233,9 +233,9 @@ MAKEROWY(ARGB,2,1,0)
MAKEROWY(BGRA,1,2,3)
MAKEROWY(ABGR,0,1,2)

void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int pix) {
void I400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
// Copy a Y to RGB.
for (int x = 0; x < pix; ++x) {
for (int x = 0; x < width; ++x) {
uint8 y = src_y[0];
dst_argb[2] = dst_argb[1] = dst_argb[0] = y;
dst_argb[3] = 255u;
@@ -360,20 +360,42 @@ void YToARGBRow_C(const uint8* y_buf, uint8* rgb_buf, int width) {

void MirrorRow_C(const uint8* src, uint8* dst, int width) {
src += width - 1;
for (int i = 0; i < width; ++i) {
dst[i] = src[0];
--src;
for (int x = 0; x < width - 1; x += 2) {
dst[x] = src[0];
dst[x + 1] = src[-1];
src -= 2;
}
if (width & 1) {
dst[width - 1] = src[0];
}
}

void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
// Copy a row of UV.
for (int x = 0; x < pix; ++x) {
dst_u[0] = src_uv[0];
dst_v[0] = src_uv[1];
src_uv += 2;
dst_u += 1;
dst_v += 1;
void MirrorRowUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
src_uv += (width - 1) << 1;
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[-2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[-2 + 1];
src_uv -= 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}

void SplitUV_C(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
for (int x = 0; x < width - 1; x += 2) {
dst_u[x] = src_uv[0];
dst_u[x + 1] = src_uv[2];
dst_v[x] = src_uv[1];
dst_v[x + 1] = src_uv[3];
src_uv += 4;
}
if (width & 1) {
dst_u[width - 1] = src_uv[0];
dst_v[width - 1] = src_uv[1];
}
}

@@ -383,9 +405,9 @@ void CopyRow_C(const uint8* src, uint8* dst, int count) {

// Filter 2 rows of YUY2 UV's (422) into U and V (420)
void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Output a row of UV values, filtering 2 rows of YUY2
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_yuy2[1] + src_yuy2[src_stride_yuy2 + 1] + 1) >> 1;
dst_v[0] = (src_yuy2[3] + src_yuy2[src_stride_yuy2 + 3] + 1) >> 1;
src_yuy2 += 4;
@@ -394,20 +416,22 @@ void YUY2ToUVRow_C(const uint8* src_yuy2, int src_stride_yuy2,
}
}

void YUY2ToYRow_C(const uint8* src_yuy2,
uint8* dst_y, int pix) {
void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of yuy2 Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_yuy2[0];
src_yuy2 += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[0];
dst_y[x + 1] = src_yuy2[2];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[0];
}
}

void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
uint8* dst_u, uint8* dst_v, int pix) {
uint8* dst_u, uint8* dst_v, int width) {
// Copy a row of uyvy UV values
for (int x = 0; x < pix; x += 2) {
for (int x = 0; x < width; x += 2) {
dst_u[0] = (src_uyvy[0] + src_uyvy[src_stride_uyvy + 0] + 1) >> 1;
dst_v[0] = (src_uyvy[2] + src_uyvy[src_stride_uyvy + 2] + 1) >> 1;
src_uyvy += 4;
@@ -416,13 +440,15 @@ void UYVYToUVRow_C(const uint8* src_uyvy, int src_stride_uyvy,
}
}

void UYVYToYRow_C(const uint8* src_uyvy,
uint8* dst_y, int pix) {
void UYVYToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width) {
// Copy a row of uyvy Y values
for (int x = 0; x < pix; ++x) {
dst_y[0] = src_uyvy[1];
src_uyvy += 2;
dst_y += 1;
for (int x = 0; x < width - 1; x += 2) {
dst_y[x] = src_yuy2[1];
dst_y[x + 1] = src_yuy2[3];
src_yuy2 += 4;
}
if (width & 1) {
dst_y[width - 1] = src_yuy2[1];
}
}


@@ -22,38 +22,26 @@ extern "C" {
"vld1.u8 {d0}, [%0]! \n" \
"vld1.u32 {d2[0]}, [%1]! \n" \
"vld1.u32 {d2[1]}, [%2]! \n" \
\
"veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
\
"vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
\
"vmull.s8 q9, d2, d25 \n"/* u/v G component */\
\
"vmov.u8 d1, #0 \n"/* split odd/even y apart */\
"vtrn.u8 d0, d1 \n" \
\
"vsub.s16 q0, q0, q15 \n"/* offset y */\
"vmul.s16 q0, q0, q14 \n" \
\
"vadd.s16 d18, d19 \n" \
\
"vqadd.s16 d20, d0, d16 \n" \
"vqadd.s16 d21, d1, d16 \n" \
\
"vqadd.s16 d22, d0, d17 \n" \
"vqadd.s16 d23, d1, d17 \n" \
\
"vqadd.s16 d16, d0, d18 \n" \
"vqadd.s16 d17, d1, d18 \n" \
\
"vqrshrun.s16 d0, q10, #6 \n" \
"vqrshrun.s16 d1, q11, #6 \n" \
"vqrshrun.s16 d2, q8, #6 \n" \
\
"vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
"vmovl.u8 q11, d1 \n" \
"vmovl.u8 q8, d2 \n" \
\
"vtrn.u8 d20, d21 \n" \
"vtrn.u8 d22, d23 \n" \
"vtrn.u8 d16, d17 \n" \
@@ -67,7 +55,7 @@ static const vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
0, 0, 0, 0, 0, 0, 0, 0 };
#endif

#if defined(HAS_I420TOARGBROW_NEON)
#ifdef HAS_I420TOARGBROW_NEON
void I420ToARGBRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -99,7 +87,7 @@ YUVTORGB
}
#endif

#if defined(HAS_I420TOBGRAROW_NEON)
#ifdef HAS_I420TOBGRAROW_NEON
void I420ToBGRARow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -132,7 +120,7 @@ YUVTORGB
}
#endif

#if defined(HAS_I420TOABGRROW_NEON)
#ifdef HAS_I420TOABGRROW_NEON
void I420ToABGRRow_NEON(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
@@ -165,10 +153,10 @@ YUVTORGB
}
#endif

#if defined(HAS_SPLITUV_NEON)
#ifdef HAS_SPLITUV_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v
// Alignment requirement: 16 bytes for pointers, and multiple of 16 pixels.
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) {
asm volatile (
"1: \n"
"vld2.u8 {q0,q1}, [%0]! \n" // load 16 pairs of UV
@@ -179,15 +167,14 @@ void SplitUV_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(pix) // %3 // Output registers
"+r"(width) // %3 // Output registers
: // Input registers
: "memory", "cc", "q0", "q1" // Clobber List
);
}
#endif

#if defined(HAS_COPYROW_NEON)
// TODO(fbarchard): Test without pld on NexusS
#ifdef HAS_COPYROW_NEON
// Copy multiple of 64
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
@@ -206,6 +193,170 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
}
#endif // HAS_COPYROW_NEON

#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %2 \n"

// work on segments that are multiples of 16
"lsrs r3, %2, #4 \n"

// the output is written in two block. 8 bytes followed
// by another 8. reading is done sequentially, from left to
// right. writing is done from right to left in block sizes
// %1, the destination pointer is incremented after writing
// the first of the two blocks. need to subtract that 8 off
// along with 16 to get the next location.
"mov r3, #-24 \n"

"beq 2f \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #16 \n"

// the loop needs to run on blocks of 16. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %2, #16 \n"

"1: \n"
"vld1.8 {q0}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments. unable to mirror
// the bytes in the entire 128 bits in one go.
"vrev64.8 q0, q0 \n"

// because of the inability to mirror the entire 128 bits
// mirror the writing out of the two 64 bit segments.
"vst1.8 {d1}, [%1]! \n"
"vst1.8 {d0}, [%1], r3 \n" // dst -= 16

"subs %2, #16 \n"
"bge 1b \n"

// add 16 back to the counter. if the result is 0 there is no
// residuals so jump past
"adds %2, #16 \n"
"beq 5f \n"

"add %1, #16 \n"

"2: \n"

"mov r3, #-3 \n"

"sub %1, #2 \n"
"subs %2, #2 \n"
// check for 16*n+1 scenarios where segments_of_2 should not
// be run, but there is something left over.
"blt 4f \n"

// do this in neon registers as per
// http://blogs.arm.com/software-enablement/196-coding-for-neon-part-2-dealing-with-leftovers/
"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d1[0]}, [%1]! \n"
"vst1.8 {d0[0]}, [%1], r3 \n" // dst -= 2

"subs %2, #2 \n"
"bge 3b \n"

"adds %2, #2 \n"
"beq 5f \n"

"4: \n"
"add %1, #1 \n"
"vld1.8 {d0[0]}, [%0] \n"
"vst1.8 {d0[0]}, [%1] \n"

"5: \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
:
: "memory", "cc", "r3", "q0"
);
}
#endif

#ifdef HAS_MIRRORROWUV_NEON
void MirrorRowUV_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width) {
asm volatile (
// compute where to start writing destination
"add %1, %3 \n" // dst_a + width
"add %2, %3 \n" // dst_b + width

// work on input segments that are multiples of 16, but
// width that has been passed is output segments, half
// the size of input.
"lsrs r12, %3, #3 \n"

"beq 2f \n"

// the output is written in to two blocks.
"mov r12, #-8 \n"

// back of destination by the size of the register that is
// going to be mirrord
"sub %1, #8 \n"
"sub %2, #8 \n"

// the loop needs to run on blocks of 8. what will be left
// over is either a negative number, the residuals that need
// to be done, or 0. if this isn't subtracted off here the
// loop will run one extra time.
"sub %3, #8 \n"

"1: \n"
"vld2.8 {d0, d1}, [%0]! \n" // src += 16

// mirror the bytes in the 64 bit segments
"vrev64.8 q0, q0 \n"

"vst1.8 {d0}, [%1], r12 \n" // dst_a -= 8
"vst1.8 {d1}, [%2], r12 \n" // dst_b -= 8

"subs %3, #8 \n"
"bge 1b \n"

// add 8 back to the counter. if the result is 0 there is no
// residuals so return
"adds %3, #8 \n"
"beq 4f \n"

"add %1, #8 \n"
"add %2, #8 \n"

"2: \n"

"mov r12, #-1 \n"

"sub %1, #1 \n"
"sub %2, #1 \n"

"3: \n"
"vld2.8 {d0[0], d1[0]}, [%0]! \n" // src += 2

"vst1.8 {d0[0]}, [%1], r12 \n" // dst_a -= 1
"vst1.8 {d1[0]}, [%2], r12 \n" // dst_b -= 1

"subs %3, %3, #1 \n"
"bgt 3b \n"
"4: \n"
: "+r"(src), // %0
"+r"(dst_a), // %1
"+r"(dst_b), // %2
"+r"(width) // %3
:
: "memory", "cc", "r12", "q0"
);
}
#endif

#endif // __ARM_NEON__

#ifdef __cplusplus

@@ -1493,7 +1493,6 @@ void YToARGBRow_SSE2(const uint8* y_buf,
#endif

#ifdef HAS_MIRRORROW_SSSE3

// Shuffle table for reversing the bytes.
CONST uvec8 kShuffleMirror = {
15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
@@ -1524,7 +1523,6 @@ void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
#endif

#ifdef HAS_MIRRORROW_SSE2

void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
@@ -1554,6 +1552,40 @@ void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
}
#endif

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
CONST uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
intptr_t temp_width = static_cast<intptr_t>(width);
asm volatile (
"movdqa %4,%%xmm1 \n"
"lea -16(%0,%3,2),%0 \n"
"sub %1,%2 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"lea -16(%0),%0 \n"
"pshufb %%xmm1,%%xmm0 \n"
"sub $8,%3 \n"
"movlpd %%xmm0,(%1) \n"
"movhpd %%xmm0,(%1,%2) \n"
"lea 8(%1),%1 \n"
"ja 1b \n"
: "+r"(src), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(temp_width) // %3
: "m"(kShuffleMirrorUV) // %4
: "memory", "cc"
#if defined(__SSE2__)
, "xmm0", "xmm1"
#endif
);
}
#endif

#ifdef HAS_SPLITUV_SSE2
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
asm volatile (

@@ -1501,7 +1501,6 @@ __asm {
#endif

#ifdef HAS_MIRRORROW_SSE2

// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3
// version can not.
__declspec(naked)
@@ -1529,6 +1528,41 @@ __asm {
}
#endif

#ifdef HAS_MIRRORROW_UV_SSSE3
// Shuffle table for reversing the bytes of UV channels.
static const uvec8 kShuffleMirrorUV = {
14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};

__declspec(naked)
void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
int width) {
__asm {
push edi
mov eax, [esp + 4 + 4] // src
mov edx, [esp + 4 + 8] // dst_u
mov edi, [esp + 4 + 12] // dst_v
mov ecx, [esp + 4 + 16] // width
movdqa xmm1, kShuffleMirrorUV
lea eax, [eax + ecx * 2 - 16]
sub edi, edx

convertloop:
movdqa xmm0, [eax]
lea eax, [eax - 16]
pshufb xmm0, xmm1
sub ecx, 8
movlpd qword ptr [edx], xmm0
movhpd qword ptr [edx + edi], xmm0
lea edx, [edx + 8]
ja convertloop

pop edi
ret
}
}
#endif

#ifdef HAS_SPLITUV_SSE2
__declspec(naked)
void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
