mirror of https://chromium.googlesource.com/libyuv/libyuv

commit 6c1b2d38c6 (parent 1f399dfaf8)

Mips port of libyuv. Includes functionality for convert, rotate, scale and memcpy.

BUG=126
TESTED=tested by mips
Review URL: https://webrtc-codereview.appspot.com/930005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@449 16f28f9a-4ce2-e073-06de-1de4eb20be90
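Note: the new MIPS paths below are wired in with libyuv's usual runtime-dispatch pattern. A row-function pointer starts at the portable C version and is swapped for the DSPR2 kernel only when the CPU flag is set and the pointers and strides meet the kernel's alignment requirements. A minimal sketch of the pattern (the surrounding function is illustrative; the real call sites follow in the diff):

void (*I422ToARGBRow)(const uint8* y_buf, const uint8* u_buf,
                      const uint8* v_buf, uint8* rgb_buf, int width) =
    I422ToARGBRow_C;  // portable fallback
#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
// Upgrade only when the DSPR2 flag is present and the buffers are aligned.
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
    IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4)) {
  I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif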
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 447
Version: 449
License: BSD
License File: LICENSE


@@ -19,9 +19,11 @@
#include "libyuv/convert_from_argb.h"
#include "libyuv/cpu_id.h"
#include "libyuv/format_conversion.h"
#include "libyuv/mjpeg_decoder.h"
#include "libyuv/planar_functions.h"
#include "libyuv/rotate.h"
#include "libyuv/rotate_argb.h"
#include "libyuv/row.h"
#include "libyuv/scale.h"
#include "libyuv/scale_argb.h"
#include "libyuv/version.h"

@@ -175,8 +175,14 @@ extern "C" {

// The following are available on Mips platforms
#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_SPLITUV_MIPS_DSPR2
#define HAS_MIRRORROW_MIPS_DSPR2
#define HAS_MIRRORROWUV_MIPS_DSPR2
#define HAS_I422TOARGBROW_MIPS_DSPR2
#define HAS_I422TOBGRAROW_MIPS_DSPR2
#define HAS_I422TOABGRROW_MIPS_DSPR2
#endif
#endif

@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
void MirrorRowUV_MIPS_DSPR2(const uint8* src, uint8* dst_u, uint8* dst_v,
int width);
void MirrorRow_C(const uint8* src, uint8* dst, int width);

void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, int width);
@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
void CopyRow_X86(const uint8* src, uint8* dst, int count);
void CopyRow_NEON(const uint8* src, uint8* dst, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
void CopyRow_C(const uint8* src, uint8* dst, int count);

void SetRow8_X86(uint8* dst, uint32 v32, int count);
@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf,
const uint8* uv_buf,
uint8* argb_buf,
int width);
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);
void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width);

void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix);
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_

#define LIBYUV_VERSION 447
#define LIBYUV_VERSION 449

#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

@@ -75,11 +75,12 @@
'source/convert_from_argb.cc',
'source/cpu_id.cc',
'source/format_conversion.cc',
'source/memcpy_mips.S',
'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc
'source/mjpeg_decoder.cc',
'source/planar_functions.cc',
'source/rotate.cc',
'source/rotate_argb.cc',
'source/rotate_mips.cc',
'source/rotate_neon.cc',
'source/row_common.cc',
'source/row_mips.cc',

@@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 rowy[kMaxStride]);
@@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 rowy[kMaxStride]);

@@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_NEON;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -652,6 +660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y,
I422ToBGRARow = I422ToBGRARow_NEON;
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {
@@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);
@@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y,
I422ToARGBRow = I422ToARGBRow_SSSE3;
}
}
#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
}
#endif

SIMD_ALIGNED(uint8 row[kMaxStride]);

@@ -174,7 +174,7 @@ int InitCpuFlags(void) {
}
}
#endif
// environment variable overrides for testing.
// Environment variable overrides for testing.
if (TestEnv("LIBYUV_DISABLE_X86")) {
cpu_info_ &= ~kCpuHasX86;
}
@@ -197,7 +197,7 @@ int InitCpuFlags(void) {
cpu_info_ &= ~kCpuHasAVX2;
}
#elif defined(__mips__) && defined(__linux__)
// linux mips parse text file for dsp detect.
// Linux mips parse text file for dsp detect.
cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP.
#if defined(__mips_dspr2)
cpu_info_ |= kCpuHasMIPS_DSPR2;
@@ -215,7 +215,7 @@ int InitCpuFlags(void) {
}
#elif defined(__arm__)
#if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
// linux arm parse text file for neon detect.
// Linux arm parse text file for neon detect.
cpu_info_ = ArmCpuCaps("/proc/cpuinfo");
#elif defined(__ARM_NEON__)
// gcc -mfpu=neon defines __ARM_NEON__

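Note: MipsCpuCaps is only referenced here, not defined in this diff. A plausible shape for that probe (a hypothetical sketch, not the actual cpu_id.cc implementation) scans /proc/cpuinfo for the requested ASE token:

#include <stdio.h>
#include <string.h>

// Hypothetical sketch of MipsCpuCaps("dsp"): scan /proc/cpuinfo for the
// token and report kCpuHasMIPS_DSP when it is present.
static int MipsCpuCapsSketch(const char* ase) {
  FILE* f = fopen("/proc/cpuinfo", "r");
  if (!f) {
    return 0;  // Unreadable cpuinfo: assume no DSP ASE.
  }
  char line[512];
  int flags = 0;
  while (fgets(line, sizeof(line), f)) {
    if (strstr(line, ase)) {
      flags = kCpuHasMIPS_DSP;
      break;
    }
  }
  fclose(f);
  return flags;
}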
@@ -1,5 +1,13 @@
#if defined (__mips__)

#
# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved.
#
# Use of this source code is governed by a BSD-style license
# that can be found in the LICENSE file in the root of the source
# tree. An additional intellectual property rights grant can be found
# in the file PATENTS. All contributing project authors may
# be found in the AUTHORS file in the root of the source tree.
#
.globl memcpy_MIPS;
.align 2;
.type memcpy_MIPS,@function;
@@ -21,8 +29,8 @@ memcpy_MIPS:
negu $a3,$a0

andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is word-aligned
subu $a2,$a2,$a3 # now a2 is the remaining bytes count
beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is
subu $a2,$a2,$a3 # word-aligned now a2 is the remaining bytes count

lwr $t8,0($a1)
addu $a1,$a1,$a3
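Note: the negu/andi pair above computes how many bytes must be copied before the destination becomes word-aligned. In portable C the same head step looks roughly like this (a sketch of the idea, not the hand-scheduled routine):

#include <stdint.h>
#include <stddef.h>

// Sketch of the head-alignment step: copy 0-3 bytes so that dst becomes
// 4-byte aligned, mirroring "negu $a3,$a0; andi $a3,$a3,0x3" above.
static void CopyHeadToAlign(uint8_t** dst, const uint8_t** src, size_t* count) {
  size_t head = (size_t)(0 - (uintptr_t)*dst) & 3;  // bytes until alignment
  if (head > *count) {
    head = *count;
  }
  *count -= head;  // what remains feeds the word-aligned bulk loop (chk16w)
  while (head--) {
    *(*dst)++ = *(*src)++;
  }
}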
@@ -30,15 +38,14 @@ memcpy_MIPS:
addu $a0,$a0,$a3

# Now the dst/src are mutually word-aligned with word-aligned addresses
chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks?
chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks

beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks

addu $t0,$a0,$a2 # t0 is the "past the end" address

# When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
@@ -164,7 +171,8 @@ last8loop:
bne $a0,$a3,last8loop
sb $v1,-1($a0)

leave: j $ra
leave:
j $ra
nop

#
@@ -183,18 +191,16 @@ unaligned:
swr $v1,0($a0)
addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1)

ua_chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks?
ua_chk16w:
andi $t8,$a2,0x3f # any whole 64-byte chunks?
# t8 is the byte count after 64-byte chunks
beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks
# There will be at most 1 32-byte chunk after it
subu $a3,$a2,$t8 # subtract from a2 the remainder
# Here a3 counts bytes in 16w chunks
addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks

addu $t0,$a0,$a2 # t0 is the "past the end" address

subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address

pref 0,0($a1) # bring the first line of src, addr 0
pref 0,32($a1) # bring the second line of src, addr 32
pref 0,64($a1) # bring the third line of src, addr 64

@@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
CopyRow = CopyRow_SSE2;
}
#endif
#if defined(HAS_COPYROW_MIPS)
if (TestCpuFlag(kCpuHasMIPS)) {
CopyRow = CopyRow_MIPS;
}
#endif

// Copy plane
for (int y = 0; y < height; ++y) {
@@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y,
}
}
}
#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
}
#endif

for (int y = 0; y < height; ++y) {

@@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride,
int width);
#endif // defined(__ARM_NEON__)

#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride, int width);
#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width);
#endif
#endif


#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
#define HAS_TRANSPOSE_WX8_SSSE3
__declspec(naked) __declspec(align(16))
@@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride,
TransposeWx8 = TransposeWx8_FAST_SSSE3;
}
#endif
#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
if (IS_ALIGNED(width, 4) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
} else {
TransposeWx8 = TransposeWx8_MIPS_DSPR2;
}
}
#endif

// Work across the source in 8x8 tiles
int i = height;
@@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride,
IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
MirrorRow = MirrorRow_SSSE3;
}
#endif
#if defined(HAS_MIRRORROW_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
MirrorRow = MirrorRow_MIPS_DSPR2;
}
#endif
void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
#if defined(HAS_COPYROW_NEON)
@@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
TransposeUVWx8 = TransposeUVWx8_SSE2;
}
#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
}
#endif

// Work through the source in 8x8 tiles.
@@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride,
IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
MirrorRowUV = MirrorRowUV_SSSE3;
}
#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
MirrorRowUV = MirrorRowUV_MIPS_DSPR2;
}
#endif

dst_a += dst_stride_a * (height - 1);

source/rotate_mips.cc (new file, 485 lines)
@@ -0,0 +1,485 @@
/*
* Copyright 2011 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/

#include "libyuv/row.h"

#include "libyuv/basic_types.h"

#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2)

void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"sw $s0, 0(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"sw $s1, 4(%[dst]) \n"
"bnez %[width], 1b \n"
" addu %[dst], %[dst], %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lbu $t0, 0(%[src]) \n"
"lbux $t1, %[src_stride](%[src]) \n"
"lbux $t8, $t2(%[src]) \n"
"lbux $t9, $t3(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s0, $t8, $t0 \n"
"lbux $t0, $t4(%[src]) \n"
"lbux $t1, $t5(%[src]) \n"
"lbux $t8, $t6(%[src]) \n"
"lbux $t9, $t7(%[src]) \n"
"sll $t1, $t1, 16 \n"
"sll $t9, $t9, 16 \n"
"or $t0, $t0, $t1 \n"
"or $t8, $t8, $t9 \n"
"precr.qb.ph $s1, $t8, $t0 \n"
"swr $s0, 0(%[dst]) \n"
"swl $s0, 3(%[dst]) \n"
"addiu %[width], -1 \n"
"addiu %[src], 1 \n"
"swr $s1, 4(%[dst]) \n"
"swl $s1, 7(%[dst]) \n"
"bnez %[width], 11b \n"
"addu %[dst], %[dst], %[dst_stride] \n"
"2: \n"
".set pop \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1"
);
}

void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst, int dst_stride,
int width) {
__asm__ __volatile__ (
".set noat \n"
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"

"srl $AT, %[width], 0x2 \n"
"andi $t0, %[dst], 0x3 \n"
"andi $t1, %[dst_stride], 0x3 \n"
"or $t0, $t0, $t1 \n"
"bnez $t0, 11f \n"
" subu $t7, $t9, %[src_stride] \n"
//dst + dst_stride word aligned
"1: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"

// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |

"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"

// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |

"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"

// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |

"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"

// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |

"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"

"sw $s4, 0(%[dst]) \n"
"sw $t0, 4(%[dst]) \n"
"sw $s6, 0($s0) \n"
"sw $t8, 4($s0) \n"
"sw $s5, 0($s1) \n"
"sw $t1, 4($s1) \n"
"sw $s7, 0($s2) \n"
"sw $t9, 4($s2) \n"

"addiu $AT, -1 \n"
"addiu %[src], 4 \n"

"bnez $AT, 1b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"b 2f \n"
//dst + dst_stride unaligned
"11: \n"
"lw $t0, 0(%[src]) \n"
"lwx $t1, %[src_stride](%[src]) \n"
"lwx $t8, $t2(%[src]) \n"
"lwx $t9, $t3(%[src]) \n"

// t0 = | 30 | 20 | 10 | 00 |
// t1 = | 31 | 21 | 11 | 01 |
// t8 = | 32 | 22 | 12 | 02 |
// t9 = | 33 | 23 | 13 | 03 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 21 | 01 | 20 | 00 |
// s1 = | 23 | 03 | 22 | 02 |
// s2 = | 31 | 11 | 30 | 10 |
// s3 = | 33 | 13 | 32 | 12 |

"precr.qb.ph $s4, $s1, $s0 \n"
"precrq.qb.ph $s5, $s1, $s0 \n"
"precr.qb.ph $s6, $s3, $s2 \n"
"precrq.qb.ph $s7, $s3, $s2 \n"

// s4 = | 03 | 02 | 01 | 00 |
// s5 = | 23 | 22 | 21 | 20 |
// s6 = | 13 | 12 | 11 | 10 |
// s7 = | 33 | 32 | 31 | 30 |

"lwx $t0, $t4(%[src]) \n"
"lwx $t1, $t5(%[src]) \n"
"lwx $t8, $t6(%[src]) \n"
"lwx $t9, $t7(%[src]) \n"

// t0 = | 34 | 24 | 14 | 04 |
// t1 = | 35 | 25 | 15 | 05 |
// t8 = | 36 | 26 | 16 | 06 |
// t9 = | 37 | 27 | 17 | 07 |

"precr.qb.ph $s0, $t1, $t0 \n"
"precr.qb.ph $s1, $t9, $t8 \n"
"precrq.qb.ph $s2, $t1, $t0 \n"
"precrq.qb.ph $s3, $t9, $t8 \n"

// s0 = | 25 | 05 | 24 | 04 |
// s1 = | 27 | 07 | 26 | 06 |
// s2 = | 35 | 15 | 34 | 14 |
// s3 = | 37 | 17 | 36 | 16 |

"precr.qb.ph $t0, $s1, $s0 \n"
"precrq.qb.ph $t1, $s1, $s0 \n"
"precr.qb.ph $t8, $s3, $s2 \n"
"precrq.qb.ph $t9, $s3, $s2 \n"

// t0 = | 07 | 06 | 05 | 04 |
// t1 = | 27 | 26 | 25 | 24 |
// t8 = | 17 | 16 | 15 | 14 |
// t9 = | 37 | 36 | 35 | 34 |

"addu $s0, %[dst], %[dst_stride] \n"
"addu $s1, $s0, %[dst_stride] \n"
"addu $s2, $s1, %[dst_stride] \n"

"swr $s4, 0(%[dst]) \n"
"swl $s4, 3(%[dst]) \n"
"swr $t0, 4(%[dst]) \n"
"swl $t0, 7(%[dst]) \n"
"swr $s6, 0($s0) \n"
"swl $s6, 3($s0) \n"
"swr $t8, 4($s0) \n"
"swl $t8, 7($s0) \n"
"swr $s5, 0($s1) \n"
"swl $s5, 3($s1) \n"
"swr $t1, 4($s1) \n"
"swl $t1, 7($s1) \n"
"swr $s7, 0($s2) \n"
"swl $s7, 3($s2) \n"
"swr $t9, 4($s2) \n"
"swl $t9, 7($s2) \n"

"addiu $AT, -1 \n"
"addiu %[src], 4 \n"

"bnez $AT, 11b \n"
" addu %[dst], $s2, %[dst_stride] \n"
"2: \n"
".set pop \n"
".set at \n"
:[src] "+r" (src),
[dst] "+r" (dst),
[width] "+r" (width)
:[src_stride] "r" (src_stride),
[dst_stride] "r" (dst_stride)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3", "s4",
"s5", "s6", "s7"
);
}

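Note: the lane diagrams above follow directly from the two DSPR2 pack instructions used. precr.qb.ph keeps the low byte of each 16-bit lane of its two operands; precrq.qb.ph keeps the high byte. A C model of both, for reading the diagrams (a sketch; lane order written low byte on the right, as in the comments):

#include <stdint.h>

// precr.qb.ph rd, rs, rt -> |rs.b2|rs.b0|rt.b2|rt.b0| (low byte of each lane)
static uint32_t precr_qb_ph(uint32_t rs, uint32_t rt) {
  return ((rs & 0x00ff0000u) << 8) | ((rs & 0x000000ffu) << 16) |
         ((rt & 0x00ff0000u) >> 8) | (rt & 0x000000ffu);
}

// precrq.qb.ph rd, rs, rt -> |rs.b3|rs.b1|rt.b3|rt.b1| (high byte of each lane)
static uint32_t precrq_qb_ph(uint32_t rs, uint32_t rt) {
  return (rs & 0xff000000u) | ((rs & 0x0000ff00u) << 8) |
         ((rt & 0xff000000u) >> 8) | ((rt & 0x0000ff00u) >> 8);
}

With t0 = |30|20|10|00| and t1 = |31|21|11|01|, precr_qb_ph(t1, t0) yields |21|01|20|00|, matching the first diagram above; two rounds of these packs complete a 4x4 byte transpose.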
void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
uint8* dst_a, int dst_stride_a,
uint8* dst_b, int dst_stride_b,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
"sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
"sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
"addu $t3, $t2, %[src_stride] \n"
"addu $t5, $t4, %[src_stride] \n"
"addu $t6, $t2, $t4 \n"
"subu $t7, $t9, %[src_stride] \n"
"srl $t1, %[width], 1 \n"

// check word alignment for dst_a, dst_b, dst_stride_a and dst_stride_b
"andi $t0, %[dst_a], 0x3 \n"
"andi $t8, %[dst_b], 0x3 \n"
"or $t0, $t0, $t8 \n"
"andi $t8, %[dst_stride_a], 0x3 \n"
"andi $s5, %[dst_stride_b], 0x3 \n"
"or $t8, $t8, $s5 \n"
"or $t0, $t0, $t8 \n"
"bnez $t0, 11f \n"
" nop \n"
// dst + dst_stride word aligned (both, a & b dst addresses)
"1: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|

"sw $s3, 0($s5) \n"
"sw $s4, 0($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|

"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"sw $s3, 0(%[dst_a]) \n"
"sw $s4, 0(%[dst_b]) \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
"sw $s3, 4($s5) \n"
"sw $s4, 4($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|

"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"sw $s3, 4(%[dst_a]) \n"
"sw $s4, 4(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 1b \n"
" addu %[dst_b], %[dst_b], $t8 \n"
"b 2f \n"
" nop \n"

// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
"11: \n"
"lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
"lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
"addu $s5, %[dst_a], %[dst_stride_a] \n"
"lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
"lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
"addu $s6, %[dst_b], %[dst_stride_b] \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
"precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
"precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|

"swr $s3, 0($s5) \n"
"swl $s3, 3($s5) \n"
"swr $s4, 0($s6) \n"
"swl $s4, 3($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|

"lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
"lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
"lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
"lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
"swr $s3, 0(%[dst_a]) \n"
"swl $s3, 3(%[dst_a]) \n"
"swr $s4, 0(%[dst_b]) \n"
"swl $s4, 3(%[dst_b]) \n"

"precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
"precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
"precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|

"sll $t0, $t0, 16 \n"
"packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
"sll $t9, $t9, 16 \n"
"packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|

"swr $s3, 4($s5) \n"
"swl $s3, 7($s5) \n"
"swr $s4, 4($s6) \n"
"swl $s4, 7($s6) \n"

"precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
"precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|

"addiu %[src], 4 \n"
"addiu $t1, -1 \n"
"sll $t0, %[dst_stride_a], 1 \n"
"sll $t8, %[dst_stride_b], 1 \n"
"swr $s3, 4(%[dst_a]) \n"
"swl $s3, 7(%[dst_a]) \n"
"swr $s4, 4(%[dst_b]) \n"
"swl $s4, 7(%[dst_b]) \n"
"addu %[dst_a], %[dst_a], $t0 \n"
"bnez $t1, 11b \n"
" addu %[dst_b], %[dst_b], $t8 \n"

"2: \n"
".set pop \n"
: [src] "+r" (src),
[dst_a] "+r" (dst_a),
[dst_b] "+r" (dst_b),
[width] "+r" (width),
[src_stride] "+r" (src_stride)
: [dst_stride_a] "r" (dst_stride_a),
[dst_stride_b] "r" (dst_stride_b)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

#ifdef __cplusplus
} // extern "C"
} // namespace libyuv
#endif

@@ -16,6 +16,13 @@ extern "C" {
#endif

#if !defined(YUV_DISABLE_ASM) && defined(__mips__)
#if defined HAS_COPYROW_MIPS
extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count);
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
memcpy_MIPS(dst, src, count);
}
#endif

#ifdef HAS_SPLITUV_MIPS_DSPR2
void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
@@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u,
}
#endif // HAS_SPLITUV_MIPS_DSPR2

#ifdef HAS_MIRRORROW_MIPS_DSPR2
void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t4, %[width], 4 \n" // multiples of 16
"andi $t5, %[width], 0xf \n"
"blez $t4, 2f \n"
" addu %[src], %[src], %[width] \n" // src += width

"1: \n"
"lw $t0, -16(%[src]) \n" // |3|2|1|0|
"lw $t1, -12(%[src]) \n" // |7|6|5|4|
"lw $t2, -8(%[src]) \n" // |11|10|9|8|
"lw $t3, -4(%[src]) \n" // |15|14|13|12|
"wsbh $t0, $t0 \n" // |2|3|0|1|
"wsbh $t1, $t1 \n" // |6|7|4|5|
"wsbh $t2, $t2 \n" // |10|11|8|9|
"wsbh $t3, $t3 \n" // |14|15|12|13|
"rotr $t0, $t0, 16 \n" // |0|1|2|3|
"rotr $t1, $t1, 16 \n" // |4|5|6|7|
"rotr $t2, $t2, 16 \n" // |8|9|10|11|
"rotr $t3, $t3, 16 \n" // |12|13|14|15|
"addiu %[src], %[src], -16 \n"
"addiu $t4, $t4, -1 \n"
"sw $t3, 0(%[dst]) \n" // |15|14|13|12|
"sw $t2, 4(%[dst]) \n" // |11|10|9|8|
"sw $t1, 8(%[dst]) \n" // |7|6|5|4|
"sw $t0, 12(%[dst]) \n" // |3|2|1|0|
"bgtz $t4, 1b \n"
" addiu %[dst], %[dst], 16 \n"
"beqz $t5, 3f \n"
" nop \n"

"2: \n"
"lbu $t0, -1(%[src]) \n"
"addiu $t5, $t5, -1 \n"
"addiu %[src], %[src], -1 \n"
"sb $t0, 0(%[dst]) \n"
"bgez $t5, 2b \n"
" addiu %[dst], %[dst], 1 \n"

"3: \n"
".set pop \n"
: [src] "+r" (src), [dst] "+r" (dst)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4", "t5"
);
}
#endif // HAS_MIRRORROW_MIPS_DSPR2

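Note: wsbh swaps the bytes within each halfword, and rotr by 16 then swaps the two halfwords, so the pair reverses all four bytes of a word. The 16-bytes-per-iteration main loop above is therefore equivalent to this portable sketch (which assumes the 4-byte alignment the dispatch code already guarantees):

#include <stdint.h>

static uint32_t ByteReverse(uint32_t v) {
  v = ((v & 0x00ff00ffu) << 8) | ((v >> 8) & 0x00ff00ffu);  // wsbh
  return (v << 16) | (v >> 16);                             // rotr 16
}

// Sketch of the main loop: mirror 16 bytes per iteration.
static void MirrorRow16(const uint8_t* src, uint8_t* dst, int chunks16) {
  const uint32_t* s = (const uint32_t*)(src + chunks16 * 16);
  uint32_t* d = (uint32_t*)dst;
  while (chunks16--) {
    s -= 4;  // step back 16 bytes
    d[0] = ByteReverse(s[3]);
    d[1] = ByteReverse(s[2]);
    d[2] = ByteReverse(s[1]);
    d[3] = ByteReverse(s[0]);
    d += 4;
  }
}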
#ifdef HAS_MIRRORROWUV_MIPS_DSPR2
void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
int x = 0;
int y = 0;
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"addu $t4, %[width], %[width] \n"
"srl %[x], %[width], 4 \n"
"andi %[y], %[width], 0xf \n"
"blez %[x], 2f \n"
" addu %[src_uv], %[src_uv], $t4 \n"

"1: \n"
"lw $t0, -32(%[src_uv]) \n" // |3|2|1|0|
"lw $t1, -28(%[src_uv]) \n" // |7|6|5|4|
"lw $t2, -24(%[src_uv]) \n" // |11|10|9|8|
"lw $t3, -20(%[src_uv]) \n" // |15|14|13|12|
"lw $t4, -16(%[src_uv]) \n" // |19|18|17|16|
"lw $t6, -12(%[src_uv]) \n" // |23|22|21|20|
"lw $t7, -8(%[src_uv]) \n" // |27|26|25|24|
"lw $t8, -4(%[src_uv]) \n" // |31|30|29|28|

"rotr $t0, $t0, 16 \n" // |1|0|3|2|
"rotr $t1, $t1, 16 \n" // |5|4|7|6|
"rotr $t2, $t2, 16 \n" // |9|8|11|10|
"rotr $t3, $t3, 16 \n" // |13|12|15|14|
"rotr $t4, $t4, 16 \n" // |17|16|19|18|
"rotr $t6, $t6, 16 \n" // |21|20|23|22|
"rotr $t7, $t7, 16 \n" // |25|24|27|26|
"rotr $t8, $t8, 16 \n" // |29|28|31|30|
"precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6|
"precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7|
"precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14|
"precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15|
"precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22|
"precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23|
"precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30|
"precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31|
"addiu %[src_uv], %[src_uv], -32 \n"
"addiu %[x], %[x], -1 \n"
"swr $t4, 0(%[dst_u]) \n"
"swl $t4, 3(%[dst_u]) \n" // |30|28|26|24|
"swr $t6, 0(%[dst_v]) \n"
"swl $t6, 3(%[dst_v]) \n" // |31|29|27|25|
"swr $t2, 4(%[dst_u]) \n"
"swl $t2, 7(%[dst_u]) \n" // |22|20|18|16|
"swr $t3, 4(%[dst_v]) \n"
"swl $t3, 7(%[dst_v]) \n" // |23|21|19|17|
"swr $t0, 8(%[dst_u]) \n"
"swl $t0, 11(%[dst_u]) \n" // |14|12|10|8|
"swr $t1, 8(%[dst_v]) \n"
"swl $t1, 11(%[dst_v]) \n" // |15|13|11|9|
"swr $t9, 12(%[dst_u]) \n"
"swl $t9, 15(%[dst_u]) \n" // |6|4|2|0|
"swr $t5, 12(%[dst_v]) \n"
"swl $t5, 15(%[dst_v]) \n" // |7|5|3|1|
"addiu %[dst_v], %[dst_v], 16 \n"
"bgtz %[x], 1b \n"
" addiu %[dst_u], %[dst_u], 16 \n"
"beqz %[y], 3f \n"
" nop \n"
"b 2f \n"
" nop \n"

"2: \n"
"lbu $t0, -2(%[src_uv]) \n"
"lbu $t1, -1(%[src_uv]) \n"
"addiu %[src_uv], %[src_uv], -2 \n"
"addiu %[y], %[y], -1 \n"
"sb $t0, 0(%[dst_u]) \n"
"sb $t1, 0(%[dst_v]) \n"
"addiu %[dst_u], %[dst_u], 1 \n"
"bgtz %[y], 2b \n"
" addiu %[dst_v], %[dst_v], 1 \n"

"3: \n"
".set pop \n"
: [src_uv] "+r" (src_uv),
[dst_u] "+r" (dst_u),
[dst_v] "+r" (dst_v),
[x] "=&r" (x),
[y] "+r" (y)
: [width] "r" (width)
: "t0", "t1", "t2", "t3", "t4",
"t5", "t6", "t7", "t8", "t9"
);
}
#endif // HAS_MIRRORROWUV_MIPS_DSPR2



// Convert (4 Y and 2 VU) I422 and arrange RGB values into
// t5 = | 0 | B0 | 0 | b0 |
// t4 = | 0 | B1 | 0 | b1 |
// t9 = | 0 | G0 | 0 | g0 |
// t8 = | 0 | G1 | 0 | g1 |
// t2 = | 0 | R0 | 0 | r0 |
// t1 = | 0 | R1 | 0 | r1 |
#define I422ToTransientMipsRGB \
"lw $t0, 0(%[y_buf]) \n" \
"lhu $t1, 0(%[u_buf]) \n" \
"lhu $t2, 0(%[v_buf]) \n" \
"preceu.ph.qbr $t1, $t1 \n" \
"preceu.ph.qbr $t2, $t2 \n" \
"preceu.ph.qbra $t3, $t0 \n" \
"preceu.ph.qbla $t0, $t0 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t3, $t3, $s4 \n" \
"subu.ph $t0, $t0, $s4 \n" \
"mul.ph $t3, $t3, $s0 \n" \
"mul.ph $t0, $t0, $s0 \n" \
"shll.ph $t4, $t1, 0x7 \n" \
"subu.ph $t4, $t4, $t1 \n" \
"mul.ph $t6, $t1, $s1 \n" \
"mul.ph $t1, $t2, $s2 \n" \
"addq_s.ph $t5, $t4, $t3 \n" \
"addq_s.ph $t4, $t4, $t0 \n" \
"shra.ph $t5, $t5, 6 \n" \
"shra.ph $t4, $t4, 6 \n" \
"addiu %[u_buf], 2 \n" \
"addiu %[v_buf], 2 \n" \
"addu.ph $t6, $t6, $t1 \n" \
"mul.ph $t1, $t2, $s3 \n" \
"addu.ph $t9, $t6, $t3 \n" \
"addu.ph $t8, $t6, $t0 \n" \
"shra.ph $t9, $t9, 6 \n" \
"shra.ph $t8, $t8, 6 \n" \
"addu.ph $t2, $t1, $t3 \n" \
"addu.ph $t1, $t1, $t0 \n" \
"shra.ph $t2, $t2, 6 \n" \
"shra.ph $t1, $t1, 6 \n" \
"subu.ph $t5, $t5, $s5 \n" \
"subu.ph $t4, $t4, $s5 \n" \
"subu.ph $t9, $t9, $s5 \n" \
"subu.ph $t8, $t8, $s5 \n" \
"subu.ph $t2, $t2, $s5 \n" \
"subu.ph $t1, $t1, $s5 \n" \
"shll_s.ph $t5, $t5, 8 \n" \
"shll_s.ph $t4, $t4, 8 \n" \
"shll_s.ph $t9, $t9, 8 \n" \
"shll_s.ph $t8, $t8, 8 \n" \
"shll_s.ph $t2, $t2, 8 \n" \
"shll_s.ph $t1, $t1, 8 \n" \
"shra.ph $t5, $t5, 8 \n" \
"shra.ph $t4, $t4, 8 \n" \
"shra.ph $t9, $t9, 8 \n" \
"shra.ph $t8, $t8, 8 \n" \
"shra.ph $t2, $t2, 8 \n" \
"shra.ph $t1, $t1, 8 \n" \
"addu.ph $t5, $t5, $s5 \n" \
"addu.ph $t4, $t4, $s5 \n" \
"addu.ph $t9, $t9, $s5 \n" \
"addu.ph $t8, $t8, $s5 \n" \
"addu.ph $t2, $t2, $s5 \n" \
"addu.ph $t1, $t1, $s5 \n"

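Note: the constants loaded in the functions below (74, -25, -52, 102, plus the 16 and 128 bias values) are 6-bit fixed-point BT.601 coefficients, and the subu/shll_s/shra/addu tail of the macro is a saturating clamp to [0, 255]. Per pixel, the macro computes in effect (a scalar sketch with the clamp written out):

#include <stdint.h>

static uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// Scalar model of I422ToTransientMipsRGB for a single pixel (sketch).
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = (y - 16) * 74;              // $s0 = |74|74|, bias $s4 = 16
  int ud = u - 128;                    // bias $s5 = 128
  int vd = v - 128;
  *b = Clamp255((ud * 127 + y1) >> 6);             // shll by 7 minus u = *127
  *g = Clamp255((ud * -25 + vd * -52 + y1) >> 6);  // $s1, $s2
  *r = Clamp255((vd * 102 + y1) >> 6);             // $s3
}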
void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128| // clipping
"lui $s6, 0xff00 \n"
"ori $s6, 0xff00 \n" // |ff|00|ff|00|
"1: \n"
I422ToTransientMipsRGB
// Arranging into argb format
"precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1|
"precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0|
"addiu %[width], -4 \n"
"precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0|
"precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0|
"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|

"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"or $t1, $t1, $s6 \n" // |ff|R1|ff|R0|
"or $t2, $t2, $s6 \n" // |ff|r1|ff|r0|
"precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1|
"precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1|
"sll $t9, $t9, 16 \n"
"sll $t8, $t8, 16 \n"
"packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0|
"packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n\t"
".set noreorder \n\t"
"beqz %[width], 2f \n\t"
" repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74|
"repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n\t" // |0|16|0|16|
"repl.ph $s5, 128 \n\t" // |128|128|
"lui $s6, 0xff00 \n\t"
"ori $s6, 0xff00 \n\t" // |ff|00|ff|00|
"1: \n"
I422ToTransientMipsRGB
// Arranging into abgr format
"precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1|
"precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0|
"precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0|
"precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0|

"precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0|
"addiu %[width], -4 \n\t"
"addiu %[y_buf], 4 \n\t"
"preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0|
"preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0|
"or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0|
"or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0|
"precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1|
"precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1|
"sll $t9, $t9, 16 \n\t"
"sll $t8, $t8, 16 \n\t"
"packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0|
"packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n\t"
"sw $t0, 4(%[rgb_buf]) \n\t"
"sw $t1, 8(%[rgb_buf]) \n\t"
"sw $t3, 12(%[rgb_buf]) \n\t"
"bnez %[width], 1b \n\t"
" addiu %[rgb_buf], 16 \n\t"
"2: \n\t"
".set pop \n\t"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf,
const uint8* u_buf,
const uint8* v_buf,
uint8* rgb_buf,
int width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"
"beqz %[width], 2f \n"
" repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 |
"repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25|
"repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52|
"repl.ph $s3, 102 \n" // |VR|VR| = |102|102|
"repl.ph $s4, 16 \n" // |0|16|0|16|
"repl.ph $s5, 128 \n" // |128|128|
"lui $s6, 0xff \n"
"ori $s6, 0xff \n" // |00|ff|00|ff|
"1: \n"
I422ToTransientMipsRGB
// Arranging into bgra format
"precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1|
"precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0|
"precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0|
"precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0|

"precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0|
"addiu %[width], -4 \n"
"addiu %[y_buf], 4 \n"
"preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0|
"preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0|
"sll $t1, $t1, 8 \n" // |R1|0 |R0|0 |
"sll $t2, $t2, 8 \n" // |r1|0 |r0|0 |
"or $t1, $t1, $s6 \n" // |R1|ff|R0|ff|
"or $t2, $t2, $s6 \n" // |r1|ff|r0|ff|
"precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff|
"precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff|
"sll $t1, $t1, 16 \n"
"sll $t2, $t2, 16 \n"
"packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff|
"packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff|
// Store results.
"sw $t2, 0(%[rgb_buf]) \n"
"sw $t0, 4(%[rgb_buf]) \n"
"sw $t1, 8(%[rgb_buf]) \n"
"sw $t3, 12(%[rgb_buf]) \n"
"bnez %[width], 1b \n"
" addiu %[rgb_buf], 16 \n"
"2: \n"
".set pop \n"
:[y_buf] "+r" (y_buf),
[u_buf] "+r" (u_buf),
[v_buf] "+r" (v_buf),
[width] "+r" (width),
[rgb_buf] "+r" (rgb_buf)
:
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9",
"s0", "s1", "s2", "s3",
"s4", "s5", "s6"
);
}

#endif // __mips__

#ifdef __cplusplus

@@ -1957,6 +1957,26 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
const unsigned char* src_ptr,
ptrdiff_t src_stride,
int dst_width, int source_y_fraction);
#define HAS_SCALEROWDOWN4_MIPS_DSPR2
void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width);
#define HAS_SCALEROWDOWN34_MIPS_DSPR2
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* d, int dst_width);
#define HAS_SCALEROWDOWN38_MIPS_DSPR2
void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width);
void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width);
#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)

// CPU agnostic row functions
@@ -2368,6 +2388,13 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */,
IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) {
ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2;
}
#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
ScaleRowDown4 = filtering ?
ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2;
}
#endif

for (int y = 0; y < dst_height; ++y) {
@@ -2461,6 +2488,19 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */,
}
}
#endif
#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2;
} else {
ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2;
ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2;
}
}
#endif

for (int y = 0; y < dst_height - 2; y += 3) {
ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width);
@@ -2541,6 +2581,18 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */,
ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3;
}
}
#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2)
if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) &&
IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) &&
IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) {
if (!filtering) {
ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2;
} else {
ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2;
ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2;
}
}
#endif

for (int y = 0; y < dst_height - 2; y += 3) {

@@ -173,6 +173,460 @@ void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
);
}

void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
uint8* dst, int dst_width) {
__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t9, %[dst_width], 3 \n"
"beqz $t9, 2f \n"
" nop \n"

"1: \n"
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
"precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0|
"precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8|
"precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16|
"precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24|
"precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0|
"precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16|
"addiu %[src_ptr], %[src_ptr], 32 \n"
"addiu $t9, $t9, -1 \n"
"sw $t1, 0(%[dst]) \n"
"sw $t5, 4(%[dst]) \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 8 \n"

"2: \n"
"andi $t9, %[dst_width], 7 \n" // residue
"beqz $t9, 3f \n"
" nop \n"

"21: \n"
"lbu $t1, 0(%[src_ptr]) \n"
"addiu %[src_ptr], %[src_ptr], 4 \n"
"addiu $t9, $t9, -1 \n"
"sb $t1, 0(%[dst]) \n"
"bgtz $t9, 21b \n"
" addiu %[dst], %[dst], 1 \n"

"3: \n"
".set pop \n"
: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst)
: [dst_width] "r" (dst_width)
: "t1", "t2", "t3", "t4", "t5",
"t6", "t7", "t8", "t9"
);
}

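Note: the three rounds of precr.qb.ph packing above boil down to keeping every fourth source byte, and the residue loop does the same one byte at a time. A portable model of the whole row function (sketch):

#include <stdint.h>

// Sketch of ScaleRowDown4: point-sample every fourth source pixel.
static void ScaleRowDown4Sketch(const uint8_t* src_ptr, uint8_t* dst,
                                int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}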
void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
intptr_t stride = src_stride;
const uint8* s1 = src_ptr + stride;
const uint8* s2 = s1 + stride;
const uint8* s3 = s2 + stride;

__asm__ __volatile__ (
".set push \n"
".set noreorder \n"

"srl $t9, %[dst_width], 1 \n"
"andi $t8, %[dst_width], 1 \n"

"1: \n"
"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16|
"lw $t5, 4(%[s1]) \n" // |23|22|21|20|
"lw $t6, 4(%[s2]) \n" // |27|26|25|24|
"lw $t7, 4(%[s3]) \n" // |31|30|29|28|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16|
"raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20|
"raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24|
"raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"add $t4, $t4, $t5 \n"
"add $t6, $t6, $t7 \n"
"add $t4, $t4, $t6 \n"
"shra_r.w $t0, $t0, 4 \n"
"shra_r.w $t4, $t4, 4 \n"
"sb $t0, 0(%[dst]) \n"
"sb $t4, 1(%[dst]) \n"
"addiu %[src_ptr], %[src_ptr], 8 \n"
"addiu %[s1], %[s1], 8 \n"
"addiu %[s2], %[s2], 8 \n"
"addiu %[s3], %[s3], 8 \n"
"addiu $t9, $t9, -1 \n"
"bgtz $t9, 1b \n"
" addiu %[dst], %[dst], 2 \n"
"beqz $t8, 2f \n"
" nop \n"

"lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0|
"lw $t1, 0(%[s1]) \n" // |7|6|5|4|
"lw $t2, 0(%[s2]) \n" // |11|10|9|8|
"lw $t3, 0(%[s3]) \n" // |15|14|13|12|
"raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0|
"raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4|
"raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8|
"raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12|
"add $t0, $t0, $t1 \n"
"add $t1, $t2, $t3 \n"
"add $t0, $t0, $t1 \n"
"shra_r.w $t0, $t0, 4 \n"
"sb $t0, 0(%[dst]) \n"

"2: \n"
".set pop \n"

: [src_ptr] "+r" (src_ptr),
[dst] "+r" (dst),
[s1] "+r" (s1),
[s2] "+r" (s2),
[s3] "+r" (s3)
: [dst_width] "r" (dst_width)
: "t0", "t1", "t2", "t3", "t4", "t5",
"t6","t7", "t8", "t9"
);
}

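Note: raddu.w.qb sums the four bytes of a word, so each output pixel above is the rounded average of a 4x4 block; the shra_r.w by 4 divides the 16-pixel sum by 16 with rounding. A portable model (sketch):

#include <stdint.h>
#include <stddef.h>

// Sketch of ScaleRowDown4Int: rounded 4x4 box filter per output pixel.
static void ScaleRowDown4IntSketch(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                   uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + x * 4;
    int sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += s[row * src_stride + col];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // matches "shra_r.w $t0, $t0, 4"
  }
}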
void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
|
||||
uint8* dst, int dst_width) {
|
||||
__asm__ __volatile__ (
|
||||
".set push \n"
|
||||
".set noreorder \n"
|
||||
"1: \n"
|
||||
"lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0|
|
||||
"lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4|
|
||||
"lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8|
|
||||
"lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12|
|
||||
"lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16|
|
||||
"lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20|
|
||||
"lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24|
|
||||
"lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28|
|
||||
"precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13|
|
||||
"precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30|
|
||||
"addiu %[dst_width], %[dst_width], -24 \n"
|
||||
"ins $t1, $t1, 8, 16 \n" // |3|1|0|X|
|
||||
"ins $t4, $t0, 8, 16 \n" // |X|15|13|12|
|
||||
"ins $t5, $t5, 8, 16 \n" // |19|17|16|X|
|
||||
"ins $t8, $t9, 8, 16 \n" // |X|31|29|28|
|
||||
"addiu %[src_ptr], %[src_ptr], 32 \n"
|
||||
"packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5|
|
||||
"packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21|
|
||||
"prepend $t1, $t2, 8 \n" // |4|3|1|0|
|
||||
"prepend $t3, $t4, 24 \n" // |15|13|12|11|
|
||||
"prepend $t5, $t6, 8 \n" // |20|19|17|16|
|
||||
"prepend $t7, $t8, 24 \n" // |31|29|28|27|
|
||||
"sw $t1, 0(%[dst]) \n"
|
||||
"sw $t0, 4(%[dst]) \n"
|
||||
"sw $t3, 8(%[dst]) \n"
|
||||
"sw $t5, 12(%[dst]) \n"
|
||||
"sw $t9, 16(%[dst]) \n"
|
||||
"sw $t7, 20(%[dst]) \n"
|
||||
"bnez %[dst_width], 1b \n"
|
||||
" addiu %[dst], %[dst], 24 \n"
|
||||
".set pop \n"
|
||||
: [src_ptr] "+r" (src_ptr),
|
||||
[dst] "+r" (dst),
|
||||
[dst_width] "+r" (dst_width)
|
||||
:
|
||||
: "t0", "t1", "t2", "t3", "t4", "t5",
|
||||
"t6","t7", "t8", "t9"
|
||||
);
|
||||
}
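
// Editor's note: all of the precrq.qb.ph / ins / packrl.ph / prepend work
// above is pure byte repacking. Of every four source pixels, pixels 0, 1
// and 3 survive, 24 outputs per iteration. The selection pattern in C
// (a sketch; like the assembly, it assumes dst_width is a multiple of 3):
static void ScaleRowDown34_Sketch(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src_ptr[0];
    dst[x + 1] = src_ptr[1];
    dst[x + 2] = src_ptr[3];  // pixel 2 of each group of 4 is dropped
    src_ptr += 4;
  }
}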

void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
      "repl.ph        $t3, 3                         \n"  // 0x00030003
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lwx            $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr           $t2, $t0, 8                    \n"  // |S0|S3|S2|S1|
      "rotr           $t6, $t1, 8                    \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t4, $t2, $t3                  \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t3                  \n"  // |T0*3|T3*3|
      "andi           $t0, $t2, 0xFFFF               \n"  // |0|0|S2|S1|
      "andi           $t1, $t6, 0xFFFF               \n"  // |0|0|T2|T1|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t1, $t1                       \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "shra_r.w       $t1, $t1, 1                    \n"
      "preceu.ph.qbr  $t2, $t2                       \n"  // |0|S2|0|S1|
      "preceu.ph.qbr  $t6, $t6                       \n"  // |0|T2|0|T1|
      "rotr           $t2, $t2, 16                   \n"  // |0|S1|0|S2|
      "rotr           $t6, $t6, 16                   \n"  // |0|T1|0|T2|
      "addu.ph        $t2, $t2, $t4                  \n"
      "addu.ph        $t6, $t6, $t5                  \n"
      "sll            $t5, $t0, 1                    \n"
      "add            $t0, $t5, $t0                  \n"
      "shra_r.ph      $t2, $t2, 2                    \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "shll.ph        $t4, $t2, 1                    \n"
      "addq.ph        $t4, $t4, $t2                  \n"
      "addu           $t0, $t0, $t1                  \n"
      "addiu          %[src_ptr], %[src_ptr], 4      \n"
      "shra_r.w       $t0, $t0, 2                    \n"
      "addu.ph        $t6, $t6, $t4                  \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "srl            $t1, $t6, 16                   \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "sb             $t1, 0(%[d])                   \n"
      "sb             $t0, 1(%[d])                   \n"
      "sb             $t6, 2(%[d])                   \n"
      "bgtz           %[dst_width], 1b               \n"
      " addiu         %[d], %[d], 3                  \n"
    "3:                                              \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [src_stride] "+r" (src_stride),
        [d] "+r" (d),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3",
        "t4", "t5", "t6"
  );
}
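
// Editor's note: each iteration reads four pixels from two rows (S, and T
// one stride below), horizontally filters them with 3:1 / 1:1 / 1:3 weights
// into three values per row, then blends the rows 3:1 in favor of S. A
// scalar sketch of the per-group math, modeled on libyuv's C 3/4 box
// filter; treat the name and layout as illustrative:
static void ScaleRowDown34_0_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  for (int x = 0; x < dst_width; x += 3) {
    uint8 a0 = (s[0] * 3 + s[1] + 2) >> 2;  // muleu_s.ph.qbl + shra_r.ph
    uint8 a1 = (s[1] + s[2] + 1) >> 1;      // raddu.w.qb + shra_r.w
    uint8 a2 = (s[2] + s[3] * 3 + 2) >> 2;
    uint8 b0 = (t[0] * 3 + t[1] + 2) >> 2;
    uint8 b1 = (t[1] + t[2] + 1) >> 1;
    uint8 b2 = (t[2] + t[3] * 3 + 2) >> 2;
    d[x + 0] = (a0 * 3 + b0 + 2) >> 2;      // 3:1 vertical blend toward S
    d[x + 1] = (a1 * 3 + b1 + 2) >> 2;
    d[x + 2] = (a2 * 3 + b2 + 2) >> 2;
    s += 4;
    t += 4;
  }
}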

void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* d, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
      "repl.ph        $t2, 3                         \n"  // 0x00030003
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lwx            $t1, %[src_stride](%[src_ptr]) \n"  // |T3|T2|T1|T0|
      "rotr           $t4, $t0, 8                    \n"  // |S0|S3|S2|S1|
      "rotr           $t6, $t1, 8                    \n"  // |T0|T3|T2|T1|
      "muleu_s.ph.qbl $t3, $t4, $t2                  \n"  // |S0*3|S3*3|
      "muleu_s.ph.qbl $t5, $t6, $t2                  \n"  // |T0*3|T3*3|
      "andi           $t0, $t4, 0xFFFF               \n"  // |0|0|S2|S1|
      "andi           $t1, $t6, 0xFFFF               \n"  // |0|0|T2|T1|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t1, $t1                       \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "shra_r.w       $t1, $t1, 1                    \n"
      "preceu.ph.qbr  $t4, $t4                       \n"  // |0|S2|0|S1|
      "preceu.ph.qbr  $t6, $t6                       \n"  // |0|T2|0|T1|
      "rotr           $t4, $t4, 16                   \n"  // |0|S1|0|S2|
      "rotr           $t6, $t6, 16                   \n"  // |0|T1|0|T2|
      "addu.ph        $t4, $t4, $t3                  \n"
      "addu.ph        $t6, $t6, $t5                  \n"
      "shra_r.ph      $t6, $t6, 2                    \n"
      "shra_r.ph      $t4, $t4, 2                    \n"
      "addu.ph        $t6, $t6, $t4                  \n"
      "addiu          %[src_ptr], %[src_ptr], 4      \n"
      "shra_r.ph      $t6, $t6, 1                    \n"
      "addu           $t0, $t0, $t1                  \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "shra_r.w       $t0, $t0, 1                    \n"
      "srl            $t1, $t6, 16                   \n"
      "sb             $t1, 0(%[d])                   \n"
      "sb             $t0, 1(%[d])                   \n"
      "sb             $t6, 2(%[d])                   \n"
      "bgtz           %[dst_width], 1b               \n"
      " addiu         %[d], %[d], 3                  \n"
    "3:                                              \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [src_stride] "+r" (src_stride),
        [d] "+r" (d),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3",
        "t4", "t5", "t6"
  );
}
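
// Editor's note: the _1_ variant applies the same horizontal filter as the
// _0_ variant above; only the vertical blend changes, from 3:1 to a rounded
// 1:1 average ("shra_r.ph $t6, $t6, 1" instead of the *3-then->>2 sequence).
// The difference as a one-line sketch (helper name is illustrative):
static inline uint8 BlendRows11(uint8 a, uint8 b) {
  return static_cast<uint8>((a + b + 1) >> 1);  // rounded 1:1 row blend
}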

void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                               uint8* dst, int dst_width) {
  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |3|2|1|0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |7|6|5|4|
      "lw             $t2, 8(%[src_ptr])             \n"  // |11|10|9|8|
      "lw             $t3, 12(%[src_ptr])            \n"  // |15|14|13|12|
      "lw             $t4, 16(%[src_ptr])            \n"  // |19|18|17|16|
      "lw             $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
      "lw             $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
      "lw             $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
      "wsbh           $t0, $t0                       \n"  // |2|3|0|1|
      "wsbh           $t6, $t6                       \n"  // |26|27|24|25|
      "srl            $t0, $t0, 8                    \n"  // |X|2|3|0|
      "srl            $t3, $t3, 16                   \n"  // |X|X|15|14|
      "srl            $t5, $t5, 16                   \n"  // |X|X|23|22|
      "srl            $t7, $t7, 16                   \n"  // |X|X|31|30|
      "ins            $t1, $t2, 24, 8                \n"  // |8|6|5|4|
      "ins            $t6, $t5, 0, 8                 \n"  // |26|27|24|22|
      "ins            $t1, $t0, 0, 16                \n"  // |8|6|3|0|
      "ins            $t6, $t7, 24, 8                \n"  // |30|27|24|22|
      "prepend        $t2, $t3, 24                   \n"  // |X|15|14|11|
      "ins            $t4, $t4, 16, 8                \n"  // |19|16|17|X|
      "ins            $t4, $t2, 0, 16                \n"  // |19|16|14|11|
      "addiu          %[src_ptr], %[src_ptr], 32     \n"
      "addiu          %[dst_width], %[dst_width], -12 \n"
      "addiu          $t8, %[dst_width], -12         \n"
      "sw             $t1, 0(%[dst])                 \n"
      "sw             $t4, 4(%[dst])                 \n"
      "sw             $t6, 8(%[dst])                 \n"
      "bgez           $t8, 1b                        \n"
      " addiu         %[dst], %[dst], 12             \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst] "+r" (dst),
        [dst_width] "+r" (dst_width)
      :
      : "t0", "t1", "t2", "t3", "t4",
        "t5", "t6", "t7", "t8"
  );
}
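
// Editor's note: despite the dense shuffling, the result is simple point
// sampling: 32 source pixels in, 12 out, keeping pixels 0, 3 and 6 of every
// 8 (see the packed-byte comments above). In C (sketch; name illustrative):
static void ScaleRowDown38_Sketch(const uint8* src_ptr, uint8* dst,
                                  int dst_width) {
  for (int x = 0; x < dst_width; x += 3) {
    dst[x + 0] = src_ptr[0];
    dst[x + 1] = src_ptr[3];
    dst[x + 2] = src_ptr[6];
    src_ptr += 8;
  }
}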

void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* t = src_ptr + stride;
  const int c = 0x2AAA;

  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |S7|S6|S5|S4|
      "lw             $t2, 0(%[t])                   \n"  // |T3|T2|T1|T0|
      "lw             $t3, 4(%[t])                   \n"  // |T7|T6|T5|T4|
      "rotr           $t1, $t1, 16                   \n"  // |S5|S4|S7|S6|
      "packrl.ph      $t4, $t1, $t3                  \n"  // |S7|S6|T7|T6|
      "packrl.ph      $t5, $t3, $t1                  \n"  // |T5|T4|S5|S4|
      "raddu.w.qb     $t4, $t4                       \n"  // S7+S6+T7+T6
      "raddu.w.qb     $t5, $t5                       \n"  // T5+T4+S5+S4
      "precrq.qb.ph   $t6, $t0, $t2                  \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph   $t6, $t6, $t6                  \n"  // |S3|T3|S3|T3|
      "srl            $t4, $t4, 2                    \n"  // t4 / 4
      "srl            $t6, $t6, 16                   \n"  // |0|0|S3|T3|
      "raddu.w.qb     $t6, $t6                       \n"  // 0+0+S3+T3
      "addu           $t6, $t5, $t6                  \n"
      "mul            $t6, $t6, %[c]                 \n"  // t6 * 0x2AAA
      "sll            $t0, $t0, 8                    \n"  // |S2|S1|S0|0|
      "sll            $t2, $t2, 8                    \n"  // |T2|T1|T0|0|
      "raddu.w.qb     $t0, $t0                       \n"  // S2+S1+S0+0
      "raddu.w.qb     $t2, $t2                       \n"  // T2+T1+T0+0
      "addu           $t0, $t0, $t2                  \n"
      "mul            $t0, $t0, %[c]                 \n"  // t0 * 0x2AAA
      "addiu          %[src_ptr], %[src_ptr], 8      \n"
      "addiu          %[t], %[t], 8                  \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "addiu          %[dst_ptr], %[dst_ptr], 3      \n"
      "srl            $t6, $t6, 16                   \n"
      "srl            $t0, $t0, 16                   \n"
      "sb             $t4, -1(%[dst_ptr])            \n"
      "sb             $t6, -2(%[dst_ptr])            \n"
      "bgtz           %[dst_width], 1b               \n"
      " sb            $t0, -3(%[dst_ptr])            \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst_ptr] "+r" (dst_ptr),
        [t] "+r" (t),
        [dst_width] "+r" (dst_width)
      : [c] "r" (c)
      : "t0", "t1", "t2", "t3", "t4", "t5", "t6"
  );
}
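
// Editor's note: the magic constant is fixed-point division. 0x2AAA = 10922
// ~= 65536 / 6, so "mul" followed by "srl ..., 16" divides a six-pixel sum
// by 6 without a hardware divider. Per output triple, eight source columns
// across two rows are box-filtered as 6 + 6 + 4 pixels. A scalar sketch
// (column grouping taken from the register comments above; name
// illustrative):
static void ScaleRowDown38_2_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  for (int x = 0; x < dst_width; x += 3) {
    d[x + 0] = static_cast<uint8>(
        ((s[0] + s[1] + s[2] + t[0] + t[1] + t[2]) * 0x2AAA) >> 16);
    d[x + 1] = static_cast<uint8>(
        ((s[3] + s[4] + s[5] + t[3] + t[4] + t[5]) * 0x2AAA) >> 16);
    d[x + 2] = static_cast<uint8>((s[6] + s[7] + t[6] + t[7]) >> 2);
    s += 8;
    t += 8;
  }
}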

void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr,
                                     ptrdiff_t src_stride,
                                     uint8* dst_ptr, int dst_width) {
  intptr_t stride = src_stride;
  const uint8* s1 = src_ptr + stride;
  stride += stride;
  const uint8* s2 = src_ptr + stride;
  const int c1 = 0x1C71;
  const int c2 = 0x2AAA;

  __asm__ __volatile__ (
      ".set push                                     \n"
      ".set noreorder                                \n"
    "1:                                              \n"
      "lw             $t0, 0(%[src_ptr])             \n"  // |S3|S2|S1|S0|
      "lw             $t1, 4(%[src_ptr])             \n"  // |S7|S6|S5|S4|
      "lw             $t2, 0(%[s1])                  \n"  // |T3|T2|T1|T0|
      "lw             $t3, 4(%[s1])                  \n"  // |T7|T6|T5|T4|
      "lw             $t4, 0(%[s2])                  \n"  // |R3|R2|R1|R0|
      "lw             $t5, 4(%[s2])                  \n"  // |R7|R6|R5|R4|
      "rotr           $t1, $t1, 16                   \n"  // |S5|S4|S7|S6|
      "packrl.ph      $t6, $t1, $t3                  \n"  // |S7|S6|T7|T6|
      "raddu.w.qb     $t6, $t6                       \n"  // S7+S6+T7+T6
      "packrl.ph      $t7, $t3, $t1                  \n"  // |T5|T4|S5|S4|
      "raddu.w.qb     $t7, $t7                       \n"  // T5+T4+S5+S4
      "sll            $t8, $t5, 16                   \n"  // |R5|R4|0|0|
      "raddu.w.qb     $t8, $t8                       \n"  // R5+R4
      "addu           $t7, $t7, $t8                  \n"
      "srl            $t8, $t5, 16                   \n"  // |0|0|R7|R6|
      "raddu.w.qb     $t8, $t8                       \n"  // R7+R6
      "addu           $t6, $t6, $t8                  \n"
      "mul            $t6, $t6, %[c2]                \n"  // t6 * 0x2AAA
      "precrq.qb.ph   $t8, $t0, $t2                  \n"  // |S3|S1|T3|T1|
      "precrq.qb.ph   $t8, $t8, $t4                  \n"  // |S3|T3|R3|R1|
      "srl            $t8, $t8, 8                    \n"  // |0|S3|T3|R3|
      "raddu.w.qb     $t8, $t8                       \n"  // S3+T3+R3
      "addu           $t7, $t7, $t8                  \n"
      "mul            $t7, $t7, %[c1]                \n"  // t7 * 0x1C71
      "sll            $t0, $t0, 8                    \n"  // |S2|S1|S0|0|
      "sll            $t2, $t2, 8                    \n"  // |T2|T1|T0|0|
      "sll            $t4, $t4, 8                    \n"  // |R2|R1|R0|0|
      "raddu.w.qb     $t0, $t0                       \n"
      "raddu.w.qb     $t2, $t2                       \n"
      "raddu.w.qb     $t4, $t4                       \n"
      "addu           $t0, $t0, $t2                  \n"
      "addu           $t0, $t0, $t4                  \n"
      "mul            $t0, $t0, %[c1]                \n"  // t0 * 0x1C71
      "addiu          %[src_ptr], %[src_ptr], 8      \n"
      "addiu          %[s1], %[s1], 8                \n"
      "addiu          %[s2], %[s2], 8                \n"
      "addiu          %[dst_width], %[dst_width], -3 \n"
      "addiu          %[dst_ptr], %[dst_ptr], 3      \n"
      "srl            $t6, $t6, 16                   \n"
      "srl            $t7, $t7, 16                   \n"
      "srl            $t0, $t0, 16                   \n"
      "sb             $t6, -1(%[dst_ptr])            \n"
      "sb             $t7, -2(%[dst_ptr])            \n"
      "bgtz           %[dst_width], 1b               \n"
      " sb            $t0, -3(%[dst_ptr])            \n"
      ".set pop                                      \n"
      : [src_ptr] "+r" (src_ptr),
        [dst_ptr] "+r" (dst_ptr),
        [s1] "+r" (s1),
        [s2] "+r" (s2),
        [dst_width] "+r" (dst_width)
      : [c1] "r" (c1), [c2] "r" (c2)
      : "t0", "t1", "t2", "t3", "t4",
        "t5", "t6", "t7", "t8"
  );
}
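
// Editor's note: the three-row variant extends the same fixed-point trick.
// 0x1C71 = 7281 ~= 65536 / 9 divides the nine-pixel sums by 9, while the
// trailing six-pixel sum reuses 0x2AAA for /6. Sketch under the same
// assumptions as the two-row version above:
static void ScaleRowDown38_3_Int_Sketch(const uint8* s, ptrdiff_t stride,
                                        uint8* d, int dst_width) {
  const uint8* t = s + stride;
  const uint8* r = s + stride * 2;
  for (int x = 0; x < dst_width; x += 3) {
    d[x + 0] = static_cast<uint8>(
        ((s[0] + s[1] + s[2] + t[0] + t[1] + t[2] + r[0] + r[1] + r[2]) *
         0x1C71) >> 16);  // 9 pixels, ~/9
    d[x + 1] = static_cast<uint8>(
        ((s[3] + s[4] + s[5] + t[3] + t[4] + t[5] + r[3] + r[4] + r[5]) *
         0x1C71) >> 16);  // 9 pixels, ~/9
    d[x + 2] = static_cast<uint8>(
        ((s[6] + s[7] + t[6] + t[7] + r[6] + r[7]) * 0x2AAA) >> 16);  // ~/6
    s += 8;
    t += 8;
    r += 8;
  }
}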

void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr,
                                const unsigned char* src_ptr,
                                ptrdiff_t src_stride,

@ -630,4 +630,70 @@ TEST_F(libyuvTest, TestAffine) {
#endif
}

TEST_F(libyuvTest, TestCopyPlane) {
  int err = 0;
  int yw = benchmark_width_;
  int yh = benchmark_height_;
  int b = 12;
  int i, j;

  int y_plane_size = (yw + b * 2) * (yh + b * 2);
  srandom(time(NULL));
  align_buffer_16(orig_y, y_plane_size)
  align_buffer_16(dst_c, y_plane_size)
  align_buffer_16(dst_opt, y_plane_size)

  memset(orig_y, 0, y_plane_size);
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 0, y_plane_size);

  // Fill the source image buffer with random data.
  for (i = b; i < (yh + b); ++i) {
    for (j = b; j < (yw + b); ++j) {
      orig_y[i * (yw + b * 2) + j] = random() & 0xff;
    }
  }

  // Fill destination buffers with random data.
  for (i = 0; i < y_plane_size; ++i) {
    uint8 random_number = random() & 0x7f;
    dst_c[i] = random_number;
    dst_opt[i] = dst_c[i];
  }

  int y_off = b * (yw + b * 2) + b;

  int y_st = yw + b * 2;
  int stride = 8;

  // Disable all optimizations.
  MaskCpuFlags(0);
  double c_time = get_time();
  for (j = 0; j < benchmark_iterations_; j++) {
    CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh);
  }
  c_time = (get_time() - c_time) / benchmark_iterations_;

  // Enable optimizations.
  MaskCpuFlags(-1);
  double opt_time = get_time();
  for (j = 0; j < benchmark_iterations_; j++) {
    CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh);
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations_;
  printf(" %8d us C - %8d us OPT\n",
         static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));

  for (i = 0; i < y_plane_size; ++i) {
    if (dst_c[i] != dst_opt[i])
      ++err;
  }

  free_aligned_buffer_16(orig_y)
  free_aligned_buffer_16(dst_c)
  free_aligned_buffer_16(dst_opt)

  EXPECT_EQ(0, err);
}

}  // namespace libyuv