From 6c1b2d38c685e769cf7db2806e27c8ec4c028fe3 Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Fri, 26 Oct 2012 22:49:18 +0000
Subject: [PATCH] Mips port of libyuv. Includes functionality for convert, rotate, scale and memcpy.

BUG=126
TESTED=tested by mips
Review URL: https://webrtc-codereview.appspot.com/930005

git-svn-id: http://libyuv.googlecode.com/svn/trunk@449 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium            |   2 +-
 include/libyuv.h           |   2 +
 include/libyuv/row.h       |  25 ++
 include/libyuv/version.h   |   2 +-
 libyuv.gyp                 |   3 +-
 source/convert_argb.cc     |  18 ++
 source/convert_from.cc     |  39 +++
 source/cpu_id.cc           |   6 +-
 source/memcpy_mips.S       | 514 +++++++++++++++++++------------------
 source/planar_functions.cc |  13 +
 source/rotate.cc           |  44 ++++
 source/rotate_mips.cc      | 485 ++++++++++++++++++++++++++++++++++
 source/row_mips.cc         | 401 +++++++++++++++++++++++++++++
 source/scale.cc            |  54 +++-
 source/scale_mips.cc       | 454 ++++++++++++++++++++++++++++++++
 unit_test/planar_test.cc   |  66 +++++
 16 files changed, 1867 insertions(+), 261 deletions(-)
 create mode 100644 source/rotate_mips.cc

diff --git a/README.chromium b/README.chromium
index 0936990f3..d8a9a5d86 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 447
+Version: 449
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv.h b/include/libyuv.h
index 1c57a41dd..bd45c8e1c 100644
--- a/include/libyuv.h
+++ b/include/libyuv.h
@@ -19,9 +19,11 @@
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
 #include "libyuv/format_conversion.h"
+#include "libyuv/mjpeg_decoder.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
 #include "libyuv/scale.h"
 #include "libyuv/scale_argb.h"
 #include "libyuv/version.h"
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 00b20b1de..f68f6ddb4 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -175,8 +175,14 @@ extern "C" {
 // The following are available on Mips platforms
 #if !defined(YUV_DISABLE_ASM) && defined(__mips__)
+#define HAS_COPYROW_MIPS
 #if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
 #define HAS_SPLITUV_MIPS_DSPR2
+#define HAS_MIRRORROW_MIPS_DSPR2
+#define HAS_MIRRORROWUV_MIPS_DSPR2
+#define HAS_I422TOARGBROW_MIPS_DSPR2
+#define HAS_I422TOBGRAROW_MIPS_DSPR2
+#define HAS_I422TOABGRROW_MIPS_DSPR2
 #endif
 #endif
@@ -282,6 +288,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix);
 void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width);
 void MirrorRow_SSE2(const uint8* src, uint8* dst, int width);
 void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width);
+void MirrorRowUV_MIPS_DSPR2(const uint8* src, uint8* dst_u, uint8* dst_v,
+                            int width);
 void MirrorRow_C(const uint8* src, uint8* dst, int width);
 void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                        int width);
@@ -321,6 +330,7 @@ void MergeUV_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
 void CopyRow_SSE2(const uint8* src, uint8* dst, int count);
 void CopyRow_X86(const uint8* src, uint8* dst, int count);
 void CopyRow_NEON(const uint8* src, uint8* dst, int count);
+void CopyRow_MIPS(const uint8* src, uint8* dst, int count);
 void CopyRow_C(const uint8* src, uint8* dst, int count);
 void SetRow8_X86(uint8* dst, uint32 v32, int count);
@@ -694,6 +704,21 @@ void NV21ToARGBRow_Any_NEON(const uint8* y_buf, const uint8* uv_buf,
                             uint8* argb_buf, int
width); +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width); void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix); void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 06aee083e..75411b9ed 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 447 +#define LIBYUV_VERSION 449 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/libyuv.gyp b/libyuv.gyp index 0fd33b35a..9607bb3a5 100644 --- a/libyuv.gyp +++ b/libyuv.gyp @@ -75,11 +75,12 @@ 'source/convert_from_argb.cc', 'source/cpu_id.cc', 'source/format_conversion.cc', - 'source/memcpy_mips.S', + 'source/memcpy_mips.S', # TODO(fbarchard): Move into row_mips.cc 'source/mjpeg_decoder.cc', 'source/planar_functions.cc', 'source/rotate.cc', 'source/rotate_argb.cc', + 'source/rotate_mips.cc', 'source/rotate_neon.cc', 'source/row_common.cc', 'source/row_mips.cc', diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 74d04dbee..5b8d285cd 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -132,6 +132,14 @@ int I422ToARGB(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -756,6 +764,11 @@ int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 rowy[kMaxStride]); @@ -829,6 +842,11 @@ int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 rowy[kMaxStride]); diff --git a/source/convert_from.cc b/source/convert_from.cc index 73ed900bd..cb00480a5 100644 --- a/source/convert_from.cc +++ b/source/convert_from.cc @@ -599,6 +599,14 @@ int I420ToARGB(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_NEON; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -652,6 
+660,14 @@ int I420ToBGRA(const uint8* src_y, int src_stride_y, I422ToBGRARow = I422ToBGRARow_NEON; } } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { @@ -909,6 +925,13 @@ int I420ToRGB565(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -975,6 +998,14 @@ int I420ToARGB1555(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); @@ -1041,6 +1072,14 @@ int I420ToARGB4444(const uint8* src_y, int src_stride_y, I422ToARGBRow = I422ToARGBRow_SSSE3; } } +#elif defined(HAS_I422TOARGBROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) { + I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2; + } #endif SIMD_ALIGNED(uint8 row[kMaxStride]); diff --git a/source/cpu_id.cc b/source/cpu_id.cc index a75739cbe..4032080f9 100644 --- a/source/cpu_id.cc +++ b/source/cpu_id.cc @@ -174,7 +174,7 @@ int InitCpuFlags(void) { } } #endif - // environment variable overrides for testing. + // Environment variable overrides for testing. if (TestEnv("LIBYUV_DISABLE_X86")) { cpu_info_ &= ~kCpuHasX86; } @@ -197,7 +197,7 @@ int InitCpuFlags(void) { cpu_info_ &= ~kCpuHasAVX2; } #elif defined(__mips__) && defined(__linux__) - // linux mips parse text file for dsp detect. + // Linux mips parse text file for dsp detect. cpu_info_ = MipsCpuCaps("dsp"); // set kCpuHasMIPS_DSP. #if defined(__mips_dspr2) cpu_info_ |= kCpuHasMIPS_DSPR2; @@ -215,7 +215,7 @@ int InitCpuFlags(void) { } #elif defined(__arm__) #if defined(__linux__) && (defined(__ARM_NEON__) || defined(LIBYUV_NEON)) - // linux arm parse text file for neon detect. + // Linux arm parse text file for neon detect. cpu_info_ = ArmCpuCaps("/proc/cpuinfo"); #elif defined(__ARM_NEON__) // gcc -mfpu=neon defines __ARM_NEON__ diff --git a/source/memcpy_mips.S b/source/memcpy_mips.S index 83292d2f9..722ef4fcf 100644 --- a/source/memcpy_mips.S +++ b/source/memcpy_mips.S @@ -1,171 +1,179 @@ #if defined (__mips__) - - .globl memcpy_MIPS; - .align 2; - .type memcpy_MIPS,@function; - .ent memcpy_MIPS,0; +# +# Copyright (c) 2012 The LibYuv project authors. All Rights Reserved. 
+# +# Use of this source code is governed by a BSD-style license +# that can be found in the LICENSE file in the root of the source +# tree. An additional intellectual property rights grant can be found +# in the file PATENTS. All contributing project authors may +# be found in the AUTHORS file in the root of the source tree. +# + .globl memcpy_MIPS; + .align 2; + .type memcpy_MIPS,@function; + .ent memcpy_MIPS,0; memcpy_MIPS: - .frame $sp,0,$ra - .set noreorder - .set noat + .frame $sp,0,$ra + .set noreorder + .set noat - slti $at,$a2,8 + slti $at,$a2,8 bne $at,$zero,last8 - move $v0,$a0 # memcpy returns the dst pointer + move $v0,$a0 # memcpy returns the dst pointer # Test if the src and dst are word-aligned, or can be made word-aligned - xor $t8,$a1,$a0 - andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement + xor $t8,$a1,$a0 + andi $t8,$t8,0x3 # t8 is a0/a1 word-displacement - bne $t8,$zero,unaligned - negu $a3,$a0 + bne $t8,$zero,unaligned + negu $a3,$a0 - andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned - beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is word-aligned - subu $a2,$a2,$a3 # now a2 is the remining bytes count + andi $a3,$a3,0x3 # we need to copy a3 bytes to make a0/a1 aligned + beq $a3,$zero,chk16w # when a3=0 then the dst (a0) is + subu $a2,$a2,$a3 # word-aligned now a2 is the remining bytes count - lwr $t8,0($a1) - addu $a1,$a1,$a3 - swr $t8,0($a0) - addu $a0,$a0,$a3 + lwr $t8,0($a1) + addu $a1,$a1,$a3 + swr $t8,0($a0) + addu $a0,$a0,$a3 # Now the dst/src are mutually word-aligned with word-aligned addresses -chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks? - # t8 is the byte count after 64-byte chunks - - beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks - # There will be at most 1 32-byte chunk after it - subu $a3,$a2,$t8 # subtract from a2 the reminder - # Here a3 counts bytes in 16w chunks - addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks - - addu $t0,$a0,$a2 # t0 is the "past the end" address +chk16w: + andi $t8,$a2,0x3f # any whole 64-byte chunks? 
+ # t8 is the byte count after 64-byte chunks + beq $a2,$t8,chk8w # if a2==t8, no 64-byte chunks + # There will be at most 1 32-byte chunk after it + subu $a3,$a2,$t8 # subtract from a2 the reminder + # Here a3 counts bytes in 16w chunks + addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks + addu $t0,$a0,$a2 # t0 is the "past the end" address # When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past # the "t0-32" address # This means: for x=128 the last "safe" a0 address is "t0-160" # Alternatively, for x=64 the last "safe" a0 address is "t0-96" # In the current version we will use "pref 30,128(a0)", so "t0-160" is the limit - subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address + subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address pref 0,0($a1) # bring the first line of src, addr 0 pref 0,32($a1) # bring the second line of src, addr 32 pref 0,64($a1) # bring the third line of src, addr 64 - pref 30,32($a0) # safe, as we have at least 64 bytes ahead + pref 30,32($a0) # safe, as we have at least 64 bytes ahead # In case the a0 > t9 don't use "pref 30" at all - sgtu $v1,$a0,$t9 - bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays - nop + sgtu $v1,$a0,$t9 + bgtz $v1,loop16w # skip "pref 30,64(a0)" for too short arrays + nop # otherwise, start with using pref30 - pref 30,64($a0) + pref 30,64($a0) loop16w: - pref 0,96($a1) - lw $t0,0($a1) - bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)" - lw $t1,4($a1) + pref 0,96($a1) + lw $t0,0($a1) + bgtz $v1,skip_pref30_96 # skip "pref 30,96(a0)" + lw $t1,4($a1) pref 30,96($a0) # continue setting up the dest, addr 96 skip_pref30_96: - lw $t2,8($a1) - lw $t3,12($a1) - lw $t4,16($a1) - lw $t5,20($a1) - lw $t6,24($a1) - lw $t7,28($a1) - pref 0,128($a1) # bring the next lines of src, addr 128 + lw $t2,8($a1) + lw $t3,12($a1) + lw $t4,16($a1) + lw $t5,20($a1) + lw $t6,24($a1) + lw $t7,28($a1) + pref 0,128($a1) # bring the next lines of src, addr 128 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) - lw $t0,32($a1) - bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)" - lw $t1,36($a1) + lw $t0,32($a1) + bgtz $v1,skip_pref30_128 # skip "pref 30,128(a0)" + lw $t1,36($a1) pref 30,128($a0) # continue setting up the dest, addr 128 skip_pref30_128: - lw $t2,40($a1) - lw $t3,44($a1) - lw $t4,48($a1) - lw $t5,52($a1) - lw $t6,56($a1) - lw $t7,60($a1) - pref 0, 160($a1) # bring the next lines of src, addr 160 + lw $t2,40($a1) + lw $t3,44($a1) + lw $t4,48($a1) + lw $t5,52($a1) + lw $t6,56($a1) + lw $t7,60($a1) + pref 0, 160($a1) # bring the next lines of src, addr 160 - sw $t0,32($a0) - sw $t1,36($a0) - sw $t2,40($a0) - sw $t3,44($a0) - sw $t4,48($a0) - sw $t5,52($a0) - sw $t6,56($a0) - sw $t7,60($a0) + sw $t0,32($a0) + sw $t1,36($a0) + sw $t2,40($a0) + sw $t3,44($a0) + sw $t4,48($a0) + sw $t5,52($a0) + sw $t6,56($a0) + sw $t7,60($a0) - addiu $a0,$a0,64 # adding 64 to dest - sgtu $v1,$a0,$t9 - bne $a0,$a3,loop16w - addiu $a1,$a1,64 # adding 64 to src - move $a2,$t8 + addiu $a0,$a0,64 # adding 64 to dest + sgtu $v1,$a0,$t9 + bne $a0,$a3,loop16w + addiu $a1,$a1,64 # adding 64 to src + move $a2,$t8 # Here we have src and dest word-aligned but less than 64-bytes to go chk8w: pref 0, 0x0($a1) - andi $t8,$a2,0x1f # is there a 32-byte chunk? 
- # the t8 is the reminder count past 32-bytes - beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk + andi $t8,$a2,0x1f # is there a 32-byte chunk? + # the t8 is the reminder count past 32-bytes + beq $a2,$t8,chk1w # when a2=t8, no 32-byte chunk nop - lw $t0,0($a1) - lw $t1,4($a1) - lw $t2,8($a1) - lw $t3,12($a1) - lw $t4,16($a1) - lw $t5,20($a1) - lw $t6,24($a1) - lw $t7,28($a1) - addiu $a1,$a1,32 + lw $t0,0($a1) + lw $t1,4($a1) + lw $t2,8($a1) + lw $t3,12($a1) + lw $t4,16($a1) + lw $t5,20($a1) + lw $t6,24($a1) + lw $t7,28($a1) + addiu $a1,$a1,32 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) - addiu $a0,$a0,32 + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) + addiu $a0,$a0,32 chk1w: - andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks - beq $a2,$t8,last8 - subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks - addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks + andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks + beq $a2,$t8,last8 + subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks + addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks # copying in words (4-byte chunks) wordCopy_loop: - lw $t3,0($a1) # the first t3 may be equal t0 ... optimize? - addiu $a1,$a1,4 - addiu $a0,$a0,4 - bne $a0,$a3,wordCopy_loop - sw $t3,-4($a0) + lw $t3,0($a1) # the first t3 may be equal t0 ... optimize? + addiu $a1,$a1,4 + addiu $a0,$a0,4 + bne $a0,$a3,wordCopy_loop + sw $t3,-4($a0) # For the last (<8) bytes last8: - blez $a2,leave - addu $a3,$a0,$a2 # a3 is the last dst address + blez $a2,leave + addu $a3,$a0,$a2 # a3 is the last dst address last8loop: - lb $v1,0($a1) - addiu $a1,$a1,1 - addiu $a0,$a0,1 - bne $a0,$a3,last8loop - sb $v1,-1($a0) + lb $v1,0($a1) + addiu $a1,$a1,1 + addiu $a0,$a0,1 + bne $a0,$a3,last8loop + sb $v1,-1($a0) -leave: j $ra - nop +leave: + j $ra + nop # # UNALIGNED case @@ -173,174 +181,172 @@ leave: j $ra unaligned: # got here with a3="negu a0" - andi $a3,$a3,0x3 # test if the a0 is word aligned - beqz $a3,ua_chk16w - subu $a2,$a2,$a3 # bytes left after initial a3 bytes + andi $a3,$a3,0x3 # test if the a0 is word aligned + beqz $a3,ua_chk16w + subu $a2,$a2,$a3 # bytes left after initial a3 bytes - lwr $v1,0($a1) - lwl $v1,3($a1) - addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3 - swr $v1,0($a0) - addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1) - -ua_chk16w: andi $t8,$a2,0x3f # any whole 64-byte chunks? - # t8 is the byte count after 64-byte chunks - beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks - # There will be at most 1 32-byte chunk after it - subu $a3,$a2,$t8 # subtract from a2 the reminder - # Here a3 counts bytes in 16w chunks - addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks - - addu $t0,$a0,$a2 # t0 is the "past the end" address - - subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address + lwr $v1,0($a1) + lwl $v1,3($a1) + addu $a1,$a1,$a3 # a3 may be here 1, 2 or 3 + swr $v1,0($a0) + addu $a0,$a0,$a3 # below the dst will be word aligned (NOTE1) +ua_chk16w: + andi $t8,$a2,0x3f # any whole 64-byte chunks? 
+ # t8 is the byte count after 64-byte chunks + beq $a2,$t8,ua_chk8w # if a2==t8, no 64-byte chunks + # There will be at most 1 32-byte chunk after it + subu $a3,$a2,$t8 # subtract from a2 the reminder + # Here a3 counts bytes in 16w chunks + addu $a3,$a0,$a3 # Now a3 is the final dst after 64-byte chunks + addu $t0,$a0,$a2 # t0 is the "past the end" address + subu $t9,$t0,160 # t9 is the "last safe pref 30,128(a0)" address pref 0,0($a1) # bring the first line of src, addr 0 pref 0,32($a1) # bring the second line of src, addr 32 pref 0,64($a1) # bring the third line of src, addr 64 - pref 30,32($a0) # safe, as we have at least 64 bytes ahead + pref 30,32($a0) # safe, as we have at least 64 bytes ahead # In case the a0 > t9 don't use "pref 30" at all - sgtu $v1,$a0,$t9 - bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays - nop + sgtu $v1,$a0,$t9 + bgtz $v1,ua_loop16w # skip "pref 30,64(a0)" for too short arrays + nop # otherwise, start with using pref30 - pref 30,64($a0) + pref 30,64($a0) ua_loop16w: - pref 0,96($a1) - lwr $t0,0($a1) - lwl $t0,3($a1) - lwr $t1,4($a1) - bgtz $v1,ua_skip_pref30_96 - lwl $t1,7($a1) + pref 0,96($a1) + lwr $t0,0($a1) + lwl $t0,3($a1) + lwr $t1,4($a1) + bgtz $v1,ua_skip_pref30_96 + lwl $t1,7($a1) pref 30,96($a0) # continue setting up the dest, addr 96 ua_skip_pref30_96: - lwr $t2,8($a1) - lwl $t2,11($a1) - lwr $t3,12($a1) - lwl $t3,15($a1) - lwr $t4,16($a1) - lwl $t4,19($a1) - lwr $t5,20($a1) - lwl $t5,23($a1) - lwr $t6,24($a1) - lwl $t6,27($a1) - lwr $t7,28($a1) - lwl $t7,31($a1) - pref 0,128($a1) # bring the next lines of src, addr 128 + lwr $t2,8($a1) + lwl $t2,11($a1) + lwr $t3,12($a1) + lwl $t3,15($a1) + lwr $t4,16($a1) + lwl $t4,19($a1) + lwr $t5,20($a1) + lwl $t5,23($a1) + lwr $t6,24($a1) + lwl $t6,27($a1) + lwr $t7,28($a1) + lwl $t7,31($a1) + pref 0,128($a1) # bring the next lines of src, addr 128 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) - lwr $t0,32($a1) - lwl $t0,35($a1) - lwr $t1,36($a1) - bgtz $v1,ua_skip_pref30_128 - lwl $t1,39($a1) + lwr $t0,32($a1) + lwl $t0,35($a1) + lwr $t1,36($a1) + bgtz $v1,ua_skip_pref30_128 + lwl $t1,39($a1) pref 30,128($a0) # continue setting up the dest, addr 128 ua_skip_pref30_128: - lwr $t2,40($a1) - lwl $t2,43($a1) - lwr $t3,44($a1) - lwl $t3,47($a1) - lwr $t4,48($a1) - lwl $t4,51($a1) - lwr $t5,52($a1) - lwl $t5,55($a1) - lwr $t6,56($a1) - lwl $t6,59($a1) - lwr $t7,60($a1) - lwl $t7,63($a1) - pref 0, 160($a1) # bring the next lines of src, addr 160 + lwr $t2,40($a1) + lwl $t2,43($a1) + lwr $t3,44($a1) + lwl $t3,47($a1) + lwr $t4,48($a1) + lwl $t4,51($a1) + lwr $t5,52($a1) + lwl $t5,55($a1) + lwr $t6,56($a1) + lwl $t6,59($a1) + lwr $t7,60($a1) + lwl $t7,63($a1) + pref 0, 160($a1) # bring the next lines of src, addr 160 - sw $t0,32($a0) - sw $t1,36($a0) - sw $t2,40($a0) - sw $t3,44($a0) - sw $t4,48($a0) - sw $t5,52($a0) - sw $t6,56($a0) - sw $t7,60($a0) + sw $t0,32($a0) + sw $t1,36($a0) + sw $t2,40($a0) + sw $t3,44($a0) + sw $t4,48($a0) + sw $t5,52($a0) + sw $t6,56($a0) + sw $t7,60($a0) - addiu $a0,$a0,64 # adding 64 to dest - sgtu $v1,$a0,$t9 - bne $a0,$a3,ua_loop16w - addiu $a1,$a1,64 # adding 64 to src - move $a2,$t8 + addiu $a0,$a0,64 # adding 64 to dest + sgtu $v1,$a0,$t9 + bne $a0,$a3,ua_loop16w + addiu $a1,$a1,64 # adding 64 to src + move $a2,$t8 # Here we have src and dest 
word-aligned but less than 64-bytes to go ua_chk8w: - pref 0, 0x0($a1) - andi $t8,$a2,0x1f # is there a 32-byte chunk? - # the t8 is the reminder count - beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk + pref 0, 0x0($a1) + andi $t8,$a2,0x1f # is there a 32-byte chunk? + # the t8 is the reminder count + beq $a2,$t8,ua_chk1w # when a2=t8, no 32-byte chunk - lwr $t0,0($a1) - lwl $t0,3($a1) - lwr $t1,4($a1) - lwl $t1,7($a1) - lwr $t2,8($a1) - lwl $t2,11($a1) - lwr $t3,12($a1) - lwl $t3,15($a1) - lwr $t4,16($a1) - lwl $t4,19($a1) - lwr $t5,20($a1) - lwl $t5,23($a1) - lwr $t6,24($a1) - lwl $t6,27($a1) - lwr $t7,28($a1) - lwl $t7,31($a1) - addiu $a1,$a1,32 + lwr $t0,0($a1) + lwl $t0,3($a1) + lwr $t1,4($a1) + lwl $t1,7($a1) + lwr $t2,8($a1) + lwl $t2,11($a1) + lwr $t3,12($a1) + lwl $t3,15($a1) + lwr $t4,16($a1) + lwl $t4,19($a1) + lwr $t5,20($a1) + lwl $t5,23($a1) + lwr $t6,24($a1) + lwl $t6,27($a1) + lwr $t7,28($a1) + lwl $t7,31($a1) + addiu $a1,$a1,32 - sw $t0,0($a0) - sw $t1,4($a0) - sw $t2,8($a0) - sw $t3,12($a0) - sw $t4,16($a0) - sw $t5,20($a0) - sw $t6,24($a0) - sw $t7,28($a0) - addiu $a0,$a0,32 + sw $t0,0($a0) + sw $t1,4($a0) + sw $t2,8($a0) + sw $t3,12($a0) + sw $t4,16($a0) + sw $t5,20($a0) + sw $t6,24($a0) + sw $t7,28($a0) + addiu $a0,$a0,32 ua_chk1w: - andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks - beq $a2,$t8,ua_smallCopy - subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks - addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks + andi $a2,$t8,0x3 # now a2 is the reminder past 1w chunks + beq $a2,$t8,ua_smallCopy + subu $a3,$t8,$a2 # a3 is count of bytes in 1w chunks + addu $a3,$a0,$a3 # now a3 is the dst address past the 1w chunks # copying in words (4-byte chunks) ua_wordCopy_loop: - lwr $v1,0($a1) - lwl $v1,3($a1) - addiu $a1,$a1,4 - addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1 - bne $a0,$a3,ua_wordCopy_loop - sw $v1,-4($a0) + lwr $v1,0($a1) + lwl $v1,3($a1) + addiu $a1,$a1,4 + addiu $a0,$a0,4 # note: dst=a0 is word aligned here, see NOTE1 + bne $a0,$a3,ua_wordCopy_loop + sw $v1,-4($a0) # Now less than 4 bytes (value in a2) left to copy ua_smallCopy: - beqz $a2,leave - addu $a3,$a0,$a2 # a3 is the last dst address + beqz $a2,leave + addu $a3,$a0,$a2 # a3 is the last dst address ua_smallCopy_loop: - lb $v1,0($a1) - addiu $a1,$a1,1 - addiu $a0,$a0,1 - bne $a0,$a3,ua_smallCopy_loop - sb $v1,-1($a0) + lb $v1,0($a1) + addiu $a1,$a1,1 + addiu $a0,$a0,1 + bne $a0,$a3,ua_smallCopy_loop + sb $v1,-1($a0) - j $ra - nop + j $ra + nop - .set at - .set reorder - .end memcpy_MIPS; - .size memcpy_MIPS,.-memcpy_MIPS + .set at + .set reorder + .end memcpy_MIPS; + .size memcpy_MIPS,.-memcpy_MIPS #endif // if defined (__mips__) diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 6de5f0342..58b6ace2f 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -46,6 +46,11 @@ void CopyPlane(const uint8* src_y, int src_stride_y, CopyRow = CopyRow_SSE2; } #endif +#if defined(HAS_COPYROW_MIPS) + if (TestCpuFlag(kCpuHasMIPS)) { + CopyRow = CopyRow_MIPS; + } +#endif // Copy plane for (int y = 0; y < height; ++y) { @@ -424,6 +429,14 @@ int I422ToBGRA(const uint8* src_y, int src_stride_y, } } } +#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) && + IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) && + IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) && + IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) && + IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 
4)) { + I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2; + } #endif for (int y = 0; y < height; ++y) { diff --git a/source/rotate.cc b/source/rotate.cc index 8f9883f47..0601dec07 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -56,6 +56,23 @@ void TransposeUVWx8_NEON(const uint8* src, int src_stride, int width); #endif // defined(__ARM_NEON__) +#if !defined(YUV_DISABLE_ASM) && defined(__mips__) +#if defined(__mips_dsp) && (__mips_dsp_rev >= 2) +#define HAS_TRANSPOSE_WX8_MIPS_DSPR2 +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, int width); +#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2 +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width); +#endif +#endif + + #if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) #define HAS_TRANSPOSE_WX8_SSSE3 __declspec(naked) __declspec(align(16)) @@ -794,6 +811,16 @@ void TransposePlane(const uint8* src, int src_stride, TransposeWx8 = TransposeWx8_FAST_SSSE3; } #endif +#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2)) { + if (IS_ALIGNED(width, 4) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2; + } else { + TransposeWx8 = TransposeWx8_MIPS_DSPR2; + } + } +#endif // Work across the source in 8x8 tiles int i = height; @@ -856,6 +883,13 @@ void RotatePlane180(const uint8* src, int src_stride, IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) { MirrorRow = MirrorRow_SSSE3; } +#endif +#if defined(HAS_MIRRORROW_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) { + MirrorRow = MirrorRow_MIPS_DSPR2; + } #endif void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C; #if defined(HAS_COPYROW_NEON) @@ -952,6 +986,11 @@ void TransposeUV(const uint8* src, int src_stride, IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { TransposeUVWx8 = TransposeUVWx8_SSE2; } +#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2; + } #endif // Work through the source in 8x8 tiles. @@ -1021,6 +1060,11 @@ void RotateUV180(const uint8* src, int src_stride, IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) { MirrorRowUV = MirrorRowUV_SSSE3; } +#elif defined(HAS_MIRRORROWUV_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) { + MirrorRowUV = MirrorRowUV_MIPS_DSPR2; + } #endif dst_a += dst_stride_a * (height - 1); diff --git a/source/rotate_mips.cc b/source/rotate_mips.cc new file mode 100644 index 000000000..430e953ee --- /dev/null +++ b/source/rotate_mips.cc @@ -0,0 +1,485 @@ +/* + * Copyright 2011 The LibYuv Project Authors. All rights reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "libyuv/row.h" + +#include "libyuv/basic_types.h" + +#ifdef __cplusplus +namespace libyuv { +extern "C" { +#endif + +#if !defined(YUV_DISABLE_ASM) && defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "sw $s0, 0(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "sw $s1, 4(%[dst]) \n" + "bnez %[width], 1b \n" + " addu %[dst], %[dst], %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lbu $t0, 0(%[src]) \n" + "lbux $t1, %[src_stride](%[src]) \n" + "lbux $t8, $t2(%[src]) \n" + "lbux $t9, $t3(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s0, $t8, $t0 \n" + "lbux $t0, $t4(%[src]) \n" + "lbux $t1, $t5(%[src]) \n" + "lbux $t8, $t6(%[src]) \n" + "lbux $t9, $t7(%[src]) \n" + "sll $t1, $t1, 16 \n" + "sll $t9, $t9, 16 \n" + "or $t0, $t0, $t1 \n" + "or $t8, $t8, $t9 \n" + "precr.qb.ph $s1, $t8, $t0 \n" + "swr $s0, 0(%[dst]) \n" + "swl $s0, 3(%[dst]) \n" + "addiu %[width], -1 \n" + "addiu %[src], 1 \n" + "swr $s1, 4(%[dst]) \n" + "swl $s1, 7(%[dst]) \n" + "bnez %[width], 11b \n" + "addu %[dst], %[dst], %[dst_stride] \n" + "2: \n" + ".set pop \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1" + ); +} + +void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst, int dst_stride, + int width) { + __asm__ __volatile__ ( + ".set noat \n" + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + + "srl $AT, %[width], 0x2 \n" + "andi $t0, %[dst], 0x3 \n" + "andi $t1, %[dst_stride], 0x3 \n" + "or $t0, $t0, $t1 \n" + "bnez $t0, 11f \n" + " subu $t7, $t9, %[src_stride] \n" +//dst + dst_stride word aligned + "1: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, 
$t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "sw $s4, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $s6, 0($s0) \n" + "sw $t8, 4($s0) \n" + "sw $s5, 0($s1) \n" + "sw $t1, 4($s1) \n" + "sw $s7, 0($s2) \n" + "sw $t9, 4($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 1b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "b 2f \n" +//dst + dst_stride unaligned + "11: \n" + "lw $t0, 0(%[src]) \n" + "lwx $t1, %[src_stride](%[src]) \n" + "lwx $t8, $t2(%[src]) \n" + "lwx $t9, $t3(%[src]) \n" + +// t0 = | 30 | 20 | 10 | 00 | +// t1 = | 31 | 21 | 11 | 01 | +// t8 = | 32 | 22 | 12 | 02 | +// t9 = | 33 | 23 | 13 | 03 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 21 | 01 | 20 | 00 | + // s1 = | 23 | 03 | 22 | 02 | + // s2 = | 31 | 11 | 30 | 10 | + // s3 = | 33 | 13 | 32 | 12 | + + "precr.qb.ph $s4, $s1, $s0 \n" + "precrq.qb.ph $s5, $s1, $s0 \n" + "precr.qb.ph $s6, $s3, $s2 \n" + "precrq.qb.ph $s7, $s3, $s2 \n" + + // s4 = | 03 | 02 | 01 | 00 | + // s5 = | 23 | 22 | 21 | 20 | + // s6 = | 13 | 12 | 11 | 10 | + // s7 = | 33 | 32 | 31 | 30 | + + "lwx $t0, $t4(%[src]) \n" + "lwx $t1, $t5(%[src]) \n" + "lwx $t8, $t6(%[src]) \n" + "lwx $t9, $t7(%[src]) \n" + +// t0 = | 34 | 24 | 14 | 04 | +// t1 = | 35 | 25 | 15 | 05 | +// t8 = | 36 | 26 | 16 | 06 | +// t9 = | 37 | 27 | 17 | 07 | + + "precr.qb.ph $s0, $t1, $t0 \n" + "precr.qb.ph $s1, $t9, $t8 \n" + "precrq.qb.ph $s2, $t1, $t0 \n" + "precrq.qb.ph $s3, $t9, $t8 \n" + + // s0 = | 25 | 05 | 24 | 04 | + // s1 = | 27 | 07 | 26 | 06 | + // s2 = | 35 | 15 | 34 | 14 | + // s3 = | 37 | 17 | 36 | 16 | + + "precr.qb.ph $t0, $s1, $s0 \n" + "precrq.qb.ph $t1, $s1, $s0 \n" + "precr.qb.ph $t8, $s3, $s2 \n" + "precrq.qb.ph $t9, $s3, $s2 \n" + + // t0 = | 07 | 06 | 05 | 04 | + // t1 = | 27 | 26 | 25 | 24 | + // t8 = | 17 | 16 | 15 | 14 | + // t9 = | 37 | 36 | 35 | 34 | + + "addu $s0, %[dst], %[dst_stride] \n" + "addu $s1, $s0, %[dst_stride] \n" + "addu $s2, $s1, %[dst_stride] \n" + + "swr $s4, 0(%[dst]) \n" + "swl $s4, 3(%[dst]) \n" + "swr $t0, 4(%[dst]) \n" + "swl $t0, 7(%[dst]) \n" + "swr $s6, 0($s0) \n" + "swl $s6, 3($s0) \n" + "swr $t8, 4($s0) \n" + "swl $t8, 7($s0) \n" + "swr $s5, 0($s1) \n" + 
"swl $s5, 3($s1) \n" + "swr $t1, 4($s1) \n" + "swl $t1, 7($s1) \n" + "swr $s7, 0($s2) \n" + "swl $s7, 3($s2) \n" + "swr $t9, 4($s2) \n" + "swl $t9, 7($s2) \n" + + "addiu $AT, -1 \n" + "addiu %[src], 4 \n" + + "bnez $AT, 11b \n" + " addu %[dst], $s2, %[dst_stride] \n" + "2: \n" + ".set pop \n" + ".set at \n" + :[src] "+r" (src), + [dst] "+r" (dst), + [width] "+r" (width) + :[src_stride] "r" (src_stride), + [dst_stride] "r" (dst_stride) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", "s4", + "s5", "s6", "s7" + ); +} + +void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride, + uint8* dst_a, int dst_stride_a, + uint8* dst_b, int dst_stride_b, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2 + "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4 + "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8 + "addu $t3, $t2, %[src_stride] \n" + "addu $t5, $t4, %[src_stride] \n" + "addu $t6, $t2, $t4 \n" + "subu $t7, $t9, %[src_stride] \n" + "srl $t1, %[width], 1 \n" + +// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b + "andi $t0, %[dst_a], 0x3 \n" + "andi $t8, %[dst_b], 0x3 \n" + "or $t0, $t0, $t8 \n" + "andi $t8, %[dst_stride_a], 0x3 \n" + "andi $s5, %[dst_stride_b], 0x3 \n" + "or $t8, $t8, $s5 \n" + "or $t0, $t0, $t8 \n" + "bnez $t0, 11f \n" + " nop \n" +// dst + dst_stride word aligned (both, a & b dst addresses) + "1: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "sw $s3, 0($s5) \n" + "sw $s4, 0($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "sw $s3, 0(%[dst_a]) \n" + "sw $s4, 0(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + "sw $s3, 4($s5) \n" + "sw $s4, 4($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "sw $s3, 4(%[dst_a]) \n" + "sw $s4, 4(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 1b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + "b 2f \n" + " nop \n" + +// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned + "11: \n" + "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0| + "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1| + "addu $s5, %[dst_a], %[dst_stride_a] \n" + "lwx $t9, $t2(%[src]) \n" 
// |B2|A2|b2|a2| + "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3| + "addu $s6, %[dst_b], %[dst_stride_b] \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0| + "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2| + "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2| + + "swr $s3, 0($s5) \n" + "swl $s3, 3($s5) \n" + "swr $s4, 0($s6) \n" + "swl $s4, 3($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0| + + "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4| + "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5| + "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6| + "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7| + "swr $s3, 0(%[dst_a]) \n" + "swl $s3, 3(%[dst_a]) \n" + "swr $s4, 0(%[dst_b]) \n" + "swl $s4, 3(%[dst_b]) \n" + + "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4| + "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7| + "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4| + + "sll $t0, $t0, 16 \n" + "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4| + "sll $t9, $t9, 16 \n" + "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6| + + "swr $s3, 4($s5) \n" + "swl $s3, 7($s5) \n" + "swr $s4, 4($s6) \n" + "swl $s4, 7($s6) \n" + + "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4| + "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4| + + "addiu %[src], 4 \n" + "addiu $t1, -1 \n" + "sll $t0, %[dst_stride_a], 1 \n" + "sll $t8, %[dst_stride_b], 1 \n" + "swr $s3, 4(%[dst_a]) \n" + "swl $s3, 7(%[dst_a]) \n" + "swr $s4, 4(%[dst_b]) \n" + "swl $s4, 7(%[dst_b]) \n" + "addu %[dst_a], %[dst_a], $t0 \n" + "bnez $t1, 11b \n" + " addu %[dst_b], %[dst_b], $t8 \n" + + "2: \n" + ".set pop \n" + : [src] "+r" (src), + [dst_a] "+r" (dst_a), + [dst_b] "+r" (dst_b), + [width] "+r" (width), + [src_stride] "+r" (src_stride) + : [dst_stride_a] "r" (dst_stride_a), + [dst_stride_b] "r" (dst_stride_b) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) + +#ifdef __cplusplus +} // extern "C" +} // namespace libyuv +#endif diff --git a/source/row_mips.cc b/source/row_mips.cc index 20f5a4fa5..df4542fbf 100644 --- a/source/row_mips.cc +++ b/source/row_mips.cc @@ -16,6 +16,13 @@ extern "C" { #endif #if !defined(YUV_DISABLE_ASM) && defined(__mips__) +#if defined HAS_COPYROW_MIPS +extern "C" void memcpy_MIPS(uint8* dst, const uint8* src, int count); +void CopyRow_MIPS(const uint8* src, uint8* dst, int count) { + memcpy_MIPS(dst, src, count); +} +#endif + #ifdef HAS_SPLITUV_MIPS_DSPR2 void SplitUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { @@ -166,6 +173,400 @@ void SplitUV_Unaligned_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, } #endif // HAS_SPLITUV_MIPS_DSPR2 +#ifdef HAS_MIRRORROW_MIPS_DSPR2 +void MirrorRow_MIPS_DSPR2(const uint8* src, uint8* dst, int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t4, %[width], 4 \n" // multiplies of 16 + "andi $t5, %[width], 0xf \n" + "blez $t4, 2f \n" + " addu %[src], %[src], %[width] \n" // src += width + + "1: \n" + "lw $t0, -16(%[src]) \n" // |3|2|1|0| + "lw $t1, -12(%[src]) \n" // |7|6|5|4| + "lw $t2, -8(%[src]) \n" // |11|10|9|8| + "lw $t3, -4(%[src]) \n" // |15|14|13|12| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t1, $t1 \n" // |6|7|4|5| + "wsbh $t2, $t2 \n" 
// |10|11|8|9| + "wsbh $t3, $t3 \n" // |14|15|12|13| + "rotr $t0, $t0, 16 \n" // |0|1|2|3| + "rotr $t1, $t1, 16 \n" // |4|5|6|7| + "rotr $t2, $t2, 16 \n" // |8|9|10|11| + "rotr $t3, $t3, 16 \n" // |12|13|14|15| + "addiu %[src], %[src], -16 \n" + "addiu $t4, $t4, -1 \n" + "sw $t3, 0(%[dst]) \n" // |15|14|13|12| + "sw $t2, 4(%[dst]) \n" // |11|10|9|8| + "sw $t1, 8(%[dst]) \n" // |7|6|5|4| + "sw $t0, 12(%[dst]) \n" // |3|2|1|0| + "bgtz $t4, 1b \n" + " addiu %[dst], %[dst], 16 \n" + "beqz $t5, 3f \n" + " nop \n" + + "2: \n" + "lbu $t0, -1(%[src]) \n" + "addiu $t5, $t5, -1 \n" + "addiu %[src], %[src], -1 \n" + "sb $t0, 0(%[dst]) \n" + "bgez $t5, 2b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src] "+r" (src), [dst] "+r" (dst) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", "t5" + ); +} +#endif // HAS_MIRRORROW_MIPS_DSPR2 + +#ifdef HAS_MIRRORROWUV_MIPS_DSPR2 +void MirrorRowUV_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, + int width) { + int x = 0; + int y = 0; + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "addu $t4, %[width], %[width] \n" + "srl %[x], %[width], 4 \n" + "andi %[y], %[width], 0xf \n" + "blez %[x], 2f \n" + " addu %[src_uv], %[src_uv], $t4 \n" + + "1: \n" + "lw $t0, -32(%[src_uv]) \n" // |3|2|1|0| + "lw $t1, -28(%[src_uv]) \n" // |7|6|5|4| + "lw $t2, -24(%[src_uv]) \n" // |11|10|9|8| + "lw $t3, -20(%[src_uv]) \n" // |15|14|13|12| + "lw $t4, -16(%[src_uv]) \n" // |19|18|17|16| + "lw $t6, -12(%[src_uv]) \n" // |23|22|21|20| + "lw $t7, -8(%[src_uv]) \n" // |27|26|25|24| + "lw $t8, -4(%[src_uv]) \n" // |31|30|29|28| + + "rotr $t0, $t0, 16 \n" // |1|0|3|2| + "rotr $t1, $t1, 16 \n" // |5|4|7|6| + "rotr $t2, $t2, 16 \n" // |9|8|11|10| + "rotr $t3, $t3, 16 \n" // |13|12|15|14| + "rotr $t4, $t4, 16 \n" // |17|16|19|18| + "rotr $t6, $t6, 16 \n" // |21|20|23|22| + "rotr $t7, $t7, 16 \n" // |25|24|27|26| + "rotr $t8, $t8, 16 \n" // |29|28|31|30| + "precr.qb.ph $t9, $t0, $t1 \n" // |0|2|4|6| + "precrq.qb.ph $t5, $t0, $t1 \n" // |1|3|5|7| + "precr.qb.ph $t0, $t2, $t3 \n" // |8|10|12|14| + "precrq.qb.ph $t1, $t2, $t3 \n" // |9|11|13|15| + "precr.qb.ph $t2, $t4, $t6 \n" // |16|18|20|22| + "precrq.qb.ph $t3, $t4, $t6 \n" // |17|19|21|23| + "precr.qb.ph $t4, $t7, $t8 \n" // |24|26|28|30| + "precrq.qb.ph $t6, $t7, $t8 \n" // |25|27|29|31| + "addiu %[src_uv], %[src_uv], -32 \n" + "addiu %[x], %[x], -1 \n" + "swr $t4, 0(%[dst_u]) \n" + "swl $t4, 3(%[dst_u]) \n" // |30|28|26|24| + "swr $t6, 0(%[dst_v]) \n" + "swl $t6, 3(%[dst_v]) \n" // |31|29|27|25| + "swr $t2, 4(%[dst_u]) \n" + "swl $t2, 7(%[dst_u]) \n" // |22|20|18|16| + "swr $t3, 4(%[dst_v]) \n" + "swl $t3, 7(%[dst_v]) \n" // |23|21|19|17| + "swr $t0, 8(%[dst_u]) \n" + "swl $t0, 11(%[dst_u]) \n" // |14|12|10|8| + "swr $t1, 8(%[dst_v]) \n" + "swl $t1, 11(%[dst_v]) \n" // |15|13|11|9| + "swr $t9, 12(%[dst_u]) \n" + "swl $t9, 15(%[dst_u]) \n" // |6|4|2|0| + "swr $t5, 12(%[dst_v]) \n" + "swl $t5, 15(%[dst_v]) \n" // |7|5|3|1| + "addiu %[dst_v], %[dst_v], 16 \n" + "bgtz %[x], 1b \n" + " addiu %[dst_u], %[dst_u], 16 \n" + "beqz %[y], 3f \n" + " nop \n" + "b 2f \n" + " nop \n" + + "2: \n" + "lbu $t0, -2(%[src_uv]) \n" + "lbu $t1, -1(%[src_uv]) \n" + "addiu %[src_uv], %[src_uv], -2 \n" + "addiu %[y], %[y], -1 \n" + "sb $t0, 0(%[dst_u]) \n" + "sb $t1, 0(%[dst_v]) \n" + "addiu %[dst_u], %[dst_u], 1 \n" + "bgtz %[y], 2b \n" + " addiu %[dst_v], %[dst_v], 1 \n" + + "3: \n" + ".set pop \n" + : [src_uv] "+r" (src_uv), + [dst_u] "+r" (dst_u), + [dst_v] "+r" (dst_v), + [x] "=&r" (x), + [y] "+r" 
(y) + : [width] "r" (width) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t7", "t8", "t9" + ); +} +#endif // HAS_MIRRORROWUV_MIPS_DSPR2 + + + +// Convert (4 Y and 2 VU) I422 and arrange RGB values into +// t5 = | 0 | B0 | 0 | b0 | +// t4 = | 0 | B1 | 0 | b1 | +// t9 = | 0 | G0 | 0 | g0 | +// t8 = | 0 | G1 | 0 | g1 | +// t2 = | 0 | R0 | 0 | r0 | +// t1 = | 0 | R1 | 0 | r1 | +#define I422ToTransientMipsRGB \ + "lw $t0, 0(%[y_buf]) \n" \ + "lhu $t1, 0(%[u_buf]) \n" \ + "lhu $t2, 0(%[v_buf]) \n" \ + "preceu.ph.qbr $t1, $t1 \n" \ + "preceu.ph.qbr $t2, $t2 \n" \ + "preceu.ph.qbra $t3, $t0 \n" \ + "preceu.ph.qbla $t0, $t0 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t3, $t3, $s4 \n" \ + "subu.ph $t0, $t0, $s4 \n" \ + "mul.ph $t3, $t3, $s0 \n" \ + "mul.ph $t0, $t0, $s0 \n" \ + "shll.ph $t4, $t1, 0x7 \n" \ + "subu.ph $t4, $t4, $t1 \n" \ + "mul.ph $t6, $t1, $s1 \n" \ + "mul.ph $t1, $t2, $s2 \n" \ + "addq_s.ph $t5, $t4, $t3 \n" \ + "addq_s.ph $t4, $t4, $t0 \n" \ + "shra.ph $t5, $t5, 6 \n" \ + "shra.ph $t4, $t4, 6 \n" \ + "addiu %[u_buf], 2 \n" \ + "addiu %[v_buf], 2 \n" \ + "addu.ph $t6, $t6, $t1 \n" \ + "mul.ph $t1, $t2, $s3 \n" \ + "addu.ph $t9, $t6, $t3 \n" \ + "addu.ph $t8, $t6, $t0 \n" \ + "shra.ph $t9, $t9, 6 \n" \ + "shra.ph $t8, $t8, 6 \n" \ + "addu.ph $t2, $t1, $t3 \n" \ + "addu.ph $t1, $t1, $t0 \n" \ + "shra.ph $t2, $t2, 6 \n" \ + "shra.ph $t1, $t1, 6 \n" \ + "subu.ph $t5, $t5, $s5 \n" \ + "subu.ph $t4, $t4, $s5 \n" \ + "subu.ph $t9, $t9, $s5 \n" \ + "subu.ph $t8, $t8, $s5 \n" \ + "subu.ph $t2, $t2, $s5 \n" \ + "subu.ph $t1, $t1, $s5 \n" \ + "shll_s.ph $t5, $t5, 8 \n" \ + "shll_s.ph $t4, $t4, 8 \n" \ + "shll_s.ph $t9, $t9, 8 \n" \ + "shll_s.ph $t8, $t8, 8 \n" \ + "shll_s.ph $t2, $t2, 8 \n" \ + "shll_s.ph $t1, $t1, 8 \n" \ + "shra.ph $t5, $t5, 8 \n" \ + "shra.ph $t4, $t4, 8 \n" \ + "shra.ph $t9, $t9, 8 \n" \ + "shra.ph $t8, $t8, 8 \n" \ + "shra.ph $t2, $t2, 8 \n" \ + "shra.ph $t1, $t1, 8 \n" \ + "addu.ph $t5, $t5, $s5 \n" \ + "addu.ph $t4, $t4, $s5 \n" \ + "addu.ph $t9, $t9, $s5 \n" \ + "addu.ph $t8, $t8, $s5 \n" \ + "addu.ph $t2, $t2, $s5 \n" \ + "addu.ph $t1, $t1, $s5 \n" + +void I422ToARGBRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| // clipping + "lui $s6, 0xff00 \n" + "ori $s6, 0xff00 \n" // |ff|00|ff|00|ff| + "1: \n" + I422ToTransientMipsRGB +// Arranging into argb format + "precr.qb.ph $t4, $t8, $t4 \n" // |G1|g1|B1|b1| + "precr.qb.ph $t5, $t9, $t5 \n" // |G0|g0|B0|b0| + "addiu %[width], -4 \n" + "precrq.qb.ph $t8, $t4, $t5 \n" // |G1|B1|G0|B0| + "precr.qb.ph $t9, $t4, $t5 \n" // |g1|b1|g0|b0| + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "or $t1, $t1, $s6 \n" // |ff|R1|ff|R0| + "or $t2, $t2, $s6 \n" // |ff|r1|ff|r0| + "precrq.ph.w $t0, $t2, $t9 \n" // |ff|r1|g1|b1| + "precrq.ph.w $t3, $t1, $t8 \n" // |ff|R1|G1|B1| + "sll $t9, $t9, 16 \n" + "sll $t8, $t8, 16 \n" + "packrl.ph $t2, $t2, $t9 \n" // |ff|r0|g0|b0| + "packrl.ph $t1, $t1, $t8 \n" // |ff|R0|G0|B0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToABGRRow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n\t" + ".set noreorder \n\t" + "beqz %[width], 2f \n\t" + " repl.ph $s0, 74 \n\t" // |YG|YG| = |74|74| + "repl.ph $s1, -25 \n\t" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n\t" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n\t" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n\t" // |0|16|0|16| + "repl.ph $s5, 128 \n\t" // |128|128| + "lui $s6, 0xff00 \n\t" + "ori $s6, 0xff00 \n\t" // |ff|00|ff|00| + "1: \n" + I422ToTransientMipsRGB +// Arranging into abgr format + "precr.qb.ph $t0, $t8, $t1 \n\t" // |G1|g1|R1|r1| + "precr.qb.ph $t3, $t9, $t2 \n\t" // |G0|g0|R0|r0| + "precrq.qb.ph $t8, $t0, $t3 \n\t" // |G1|R1|G0|R0| + "precr.qb.ph $t9, $t0, $t3 \n\t" // |g1|r1|g0|r0| + + "precr.qb.ph $t2, $t4, $t5 \n\t" // |B1|b1|B0|b0| + "addiu %[width], -4 \n\t" + "addiu %[y_buf], 4 \n\t" + "preceu.ph.qbla $t1, $t2 \n\t" // |0 |B1|0 |B0| + "preceu.ph.qbra $t2, $t2 \n\t" // |0 |b1|0 |b0| + "or $t1, $t1, $s6 \n\t" // |ff|B1|ff|B0| + "or $t2, $t2, $s6 \n\t" // |ff|b1|ff|b0| + "precrq.ph.w $t0, $t2, $t9 \n\t" // |ff|b1|g1|r1| + "precrq.ph.w $t3, $t1, $t8 \n\t" // |ff|B1|G1|R1| + "sll $t9, $t9, 16 \n\t" + "sll $t8, $t8, 16 \n\t" + "packrl.ph $t2, $t2, $t9 \n\t" // |ff|b0|g0|r0| + "packrl.ph $t1, $t1, $t8 \n\t" // |ff|B0|G0|R0| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n\t" + "sw $t0, 4(%[rgb_buf]) \n\t" + "sw $t1, 8(%[rgb_buf]) \n\t" + "sw $t3, 12(%[rgb_buf]) \n\t" + "bnez %[width], 1b \n\t" + " addiu %[rgb_buf], 16 \n\t" + "2: \n\t" + ".set pop \n\t" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + +void I422ToBGRARow_MIPS_DSPR2(const uint8* y_buf, + const uint8* u_buf, + const uint8* v_buf, + uint8* rgb_buf, + int width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "beqz %[width], 2f \n" + " repl.ph $s0, 74 \n" // |YG|YG| = |74 |74 | + "repl.ph $s1, -25 \n" // |UG|UG| = |-25|-25| + "repl.ph $s2, -52 \n" // |VG|VG| = |-52|-52| + "repl.ph $s3, 102 \n" // |VR|VR| = |102|102| + "repl.ph $s4, 16 \n" // |0|16|0|16| + "repl.ph $s5, 128 \n" // |128|128| + "lui $s6, 0xff \n" + "ori $s6, 0xff \n" // |00|ff|00|ff| + "1: \n" + I422ToTransientMipsRGB + // Arranging into bgra format + "precr.qb.ph $t4, $t4, $t8 \n" // |B1|b1|G1|g1| + "precr.qb.ph $t5, $t5, $t9 \n" // |B0|b0|G0|g0| + "precrq.qb.ph $t8, $t4, $t5 \n" // |B1|G1|B0|G0| + "precr.qb.ph $t9, $t4, $t5 \n" // |b1|g1|b0|g0| + + "precr.qb.ph $t2, $t1, $t2 \n" // |R1|r1|R0|r0| + "addiu %[width], -4 \n" + "addiu %[y_buf], 4 \n" + "preceu.ph.qbla $t1, $t2 \n" // |0 |R1|0 |R0| + "preceu.ph.qbra $t2, $t2 \n" // |0 |r1|0 |r0| + "sll $t1, $t1, 8 \n" // |R1|0 |R0|0 | + "sll $t2, $t2, 8 \n" // |r1|0 |r0|0 | + "or $t1, $t1, $s6 \n" // |R1|ff|R0|ff| + "or $t2, $t2, $s6 \n" // |r1|ff|r0|ff| + "precrq.ph.w $t0, $t9, $t2 \n" // |b1|g1|r1|ff| + "precrq.ph.w $t3, $t8, $t1 \n" // |B1|G1|R1|ff| + "sll $t1, $t1, 16 \n" + "sll $t2, $t2, 16 \n" + "packrl.ph $t2, $t9, $t2 \n" // |b0|g0|r0|ff| + "packrl.ph $t1, $t8, $t1 \n" // |B0|G0|R0|ff| +// Store results. 
+ "sw $t2, 0(%[rgb_buf]) \n" + "sw $t0, 4(%[rgb_buf]) \n" + "sw $t1, 8(%[rgb_buf]) \n" + "sw $t3, 12(%[rgb_buf]) \n" + "bnez %[width], 1b \n" + " addiu %[rgb_buf], 16 \n" + "2: \n" + ".set pop \n" + :[y_buf] "+r" (y_buf), + [u_buf] "+r" (u_buf), + [v_buf] "+r" (v_buf), + [width] "+r" (width), + [rgb_buf] "+r" (rgb_buf) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9", + "s0", "s1", "s2", "s3", + "s4", "s5", "s6" + ); +} + #endif // __mips__ #ifdef __cplusplus diff --git a/source/scale.cc b/source/scale.cc index a7732c4bc..1793b6f19 100644 --- a/source/scale.cc +++ b/source/scale.cc @@ -1957,6 +1957,26 @@ void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, const unsigned char* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction); +#define HAS_SCALEROWDOWN4_MIPS_DSPR2 +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width); +#define HAS_SCALEROWDOWN34_MIPS_DSPR2 +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width); +#define HAS_SCALEROWDOWN38_MIPS_DSPR2 +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width); +void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); +void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width); #endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2) // CPU agnostic row functions @@ -2331,7 +2351,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */, IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { ScaleRowDown2 = filtering ? - ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; + ScaleRowDown2Int_MIPS_DSPR2 : ScaleRowDown2_MIPS_DSPR2; } #endif @@ -2368,6 +2388,13 @@ static void ScalePlaneDown4(int /* src_width */, int /* src_height */, IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16)) { ScaleRowDown4 = filtering ? ScaleRowDown4Int_SSE2 : ScaleRowDown4_SSE2; } +#elif defined(HAS_SCALEROWDOWN4_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + ScaleRowDown4 = filtering ? 
+ ScaleRowDown4Int_MIPS_DSPR2 : ScaleRowDown4_MIPS_DSPR2; + } #endif for (int y = 0; y < dst_height; ++y) { @@ -2461,6 +2488,19 @@ static void ScalePlaneDown34(int /* src_width */, int /* src_height */, } } #endif +#if defined(HAS_SCALEROWDOWN34_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 24 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown34_0 = ScaleRowDown34_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_MIPS_DSPR2; + } else { + ScaleRowDown34_0 = ScaleRowDown34_0_Int_MIPS_DSPR2; + ScaleRowDown34_1 = ScaleRowDown34_1_Int_MIPS_DSPR2; + } + } +#endif for (int y = 0; y < dst_height - 2; y += 3) { ScaleRowDown34_0(src_ptr, src_stride, dst_ptr, dst_width); @@ -2541,6 +2581,18 @@ static void ScalePlaneDown38(int /* src_width */, int /* src_height */, ScaleRowDown38_2 = ScaleRowDown38_2_Int_SSSE3; } } +#elif defined(HAS_SCALEROWDOWN38_MIPS_DSPR2) + if (TestCpuFlag(kCpuHasMIPS_DSPR2) && (dst_width % 12 == 0) && + IS_ALIGNED(src_ptr, 4) && IS_ALIGNED(src_stride, 4) && + IS_ALIGNED(dst_ptr, 4) && IS_ALIGNED(dst_stride, 4)) { + if (!filtering) { + ScaleRowDown38_3 = ScaleRowDown38_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_MIPS_DSPR2; + } else { + ScaleRowDown38_3 = ScaleRowDown38_3_Int_MIPS_DSPR2; + ScaleRowDown38_2 = ScaleRowDown38_2_Int_MIPS_DSPR2; + } + } #endif for (int y = 0; y < dst_height - 2; y += 3) { diff --git a/source/scale_mips.cc b/source/scale_mips.cc index ce7241662..8f380d475 100644 --- a/source/scale_mips.cc +++ b/source/scale_mips.cc @@ -173,6 +173,460 @@ void ScaleRowDown2Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, ); } +void ScaleRowDown4_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + "srl $t9, %[dst_width], 3 \n" + "beqz $t9, 2f \n" + " nop \n" + + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precr.qb.ph $t1, $t2, $t1 \n" // |6|4|2|0| + "precr.qb.ph $t2, $t4, $t3 \n" // |14|12|10|8| + "precr.qb.ph $t5, $t6, $t5 \n" // |22|20|18|16| + "precr.qb.ph $t6, $t8, $t7 \n" // |30|28|26|24| + "precr.qb.ph $t1, $t2, $t1 \n" // |12|8|4|0| + "precr.qb.ph $t5, $t6, $t5 \n" // |28|24|20|16| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu $t9, $t9, -1 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t5, 4(%[dst]) \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 8 \n" + + "2: \n" + "andi $t9, %[dst_width], 7 \n" // residue + "beqz $t9, 3f \n" + " nop \n" + + "21: \n" + "lbu $t1, 0(%[src_ptr]) \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "addiu $t9, $t9, -1 \n" + "sb $t1, 0(%[dst]) \n" + "bgtz $t9, 21b \n" + " addiu %[dst], %[dst], 1 \n" + + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst) + : [dst_width] "r" (dst_width) + : "t1", "t2", "t3", "t4", "t5", + "t6", "t7", "t8", "t9" + ); +} + +void ScaleRowDown4Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + const uint8* s2 = s1 + stride; + const uint8* s3 = s2 + stride; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + + 
"srl $t9, %[dst_width], 1 \n" + "andi $t8, %[dst_width], 1 \n" + + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "lw $t4, 4(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 4(%[s1]) \n" // |23|22|21|20| + "lw $t6, 4(%[s2]) \n" // |27|26|25|24| + "lw $t7, 4(%[s3]) \n" // |31|30|29|28| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "raddu.w.qb $t4, $t4 \n" // |19 + 18 + 17 + 16| + "raddu.w.qb $t5, $t5 \n" // |23 + 22 + 21 + 20| + "raddu.w.qb $t6, $t6 \n" // |27 + 26 + 25 + 24| + "raddu.w.qb $t7, $t7 \n" // |31 + 30 + 29 + 28| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "add $t4, $t4, $t5 \n" + "add $t6, $t6, $t7 \n" + "add $t4, $t4, $t6 \n" + "shra_r.w $t0, $t0, 4 \n" + "shra_r.w $t4, $t4, 4 \n" + "sb $t0, 0(%[dst]) \n" + "sb $t4, 1(%[dst]) \n" + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[s3], %[s3], 8 \n" + "addiu $t9, $t9, -1 \n" + "bgtz $t9, 1b \n" + " addiu %[dst], %[dst], 2 \n" + "beqz $t8, 2f \n" + " nop \n" + + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 0(%[s1]) \n" // |7|6|5|4| + "lw $t2, 0(%[s2]) \n" // |11|10|9|8| + "lw $t3, 0(%[s3]) \n" // |15|14|13|12| + "raddu.w.qb $t0, $t0 \n" // |3 + 2 + 1 + 0| + "raddu.w.qb $t1, $t1 \n" // |7 + 6 + 5 + 4| + "raddu.w.qb $t2, $t2 \n" // |11 + 10 + 9 + 8| + "raddu.w.qb $t3, $t3 \n" // |15 + 14 + 13 + 12| + "add $t0, $t0, $t1 \n" + "add $t1, $t2, $t3 \n" + "add $t0, $t0, $t1 \n" + "shra_r.w $t0, $t0, 4 \n" + "sb $t0, 0(%[dst]) \n" + + "2: \n" + ".set pop \n" + + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [s1] "+r" (s1), + [s2] "+r" (s2), + [s3] "+r" (s3) + : [dst_width] "r" (dst_width) + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + ); +} + +void ScaleRowDown34_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t1, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t2, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t3, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t4, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw $t5, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t6, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t7, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t8, 28(%[src_ptr]) \n" // |31|30|29|28| + "precrq.qb.ph $t0, $t2, $t4 \n" // |7|5|15|13| + "precrq.qb.ph $t9, $t6, $t8 \n" // |23|21|31|30| + "addiu %[dst_width], %[dst_width], -24 \n" + "ins $t1, $t1, 8, 16 \n" // |3|1|0|X| + "ins $t4, $t0, 8, 16 \n" // |X|15|13|12| + "ins $t5, $t5, 8, 16 \n" // |19|17|16|X| + "ins $t8, $t9, 8, 16 \n" // |X|31|29|28| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "packrl.ph $t0, $t3, $t0 \n" // |9|8|7|5| + "packrl.ph $t9, $t7, $t9 \n" // |25|24|23|21| + "prepend $t1, $t2, 8 \n" // |4|3|1|0| + "prepend $t3, $t4, 24 \n" // |15|13|12|11| + "prepend $t5, $t6, 8 \n" // |20|19|17|16| + "prepend $t7, $t8, 24 \n" // |31|29|28|27| + "sw $t1, 0(%[dst]) \n" + "sw $t0, 4(%[dst]) \n" + "sw $t3, 8(%[dst]) \n" + "sw $t5, 12(%[dst]) \n" + "sw $t9, 16(%[dst]) \n" + "sw $t7, 20(%[dst]) \n" + "bnez %[dst_width], 1b \n" + " addiu %[dst], %[dst], 24 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", "t5", + "t6","t7", "t8", "t9" + 
); +} + +void ScaleRowDown34_0_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t3, 3 \n" // 0x00030003 + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t2, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t4, $t2, $t3 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t3 \n" // |T0*3|T3*3| + "andi $t0, $t2, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t2, $t2 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t2, $t2, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t2, $t2, $t4 \n" + "addu.ph $t6, $t6, $t5 \n" + "sll $t5, $t0, 1 \n" + "add $t0, $t5, $t0 \n" + "shra_r.ph $t2, $t2, 2 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shll.ph $t4, $t2, 1 \n" + "addq.ph $t4, $t4, $t2 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.w $t0, $t0, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "shra_r.ph $t6, $t6, 2 \n" + "srl $t1, $t6, 16 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown34_1_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* d, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "repl.ph $t2, 3 \n" // 0x00030003 + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lwx $t1, %[src_stride](%[src_ptr]) \n" // |T3|T2|T1|T0| + "rotr $t4, $t0, 8 \n" // |S0|S3|S2|S1| + "rotr $t6, $t1, 8 \n" // |T0|T3|T2|T1| + "muleu_s.ph.qbl $t3, $t4, $t2 \n" // |S0*3|S3*3| + "muleu_s.ph.qbl $t5, $t6, $t2 \n" // |T0*3|T3*3| + "andi $t0, $t4, 0xFFFF \n" // |0|0|S2|S1| + "andi $t1, $t6, 0xFFFF \n" // |0|0|T2|T1| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t1, $t1 \n" + "shra_r.w $t0, $t0, 1 \n" + "shra_r.w $t1, $t1, 1 \n" + "preceu.ph.qbr $t4, $t4 \n" // |0|S2|0|S1| + "preceu.ph.qbr $t6, $t6 \n" // |0|T2|0|T1| + "rotr $t4, $t4, 16 \n" // |0|S1|0|S2| + "rotr $t6, $t6, 16 \n" // |0|T1|0|T2| + "addu.ph $t4, $t4, $t3 \n" + "addu.ph $t6, $t6, $t5 \n" + "shra_r.ph $t6, $t6, 2 \n" + "shra_r.ph $t4, $t4, 2 \n" + "addu.ph $t6, $t6, $t4 \n" + "addiu %[src_ptr], %[src_ptr], 4 \n" + "shra_r.ph $t6, $t6, 1 \n" + "addu $t0, $t0, $t1 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "shra_r.w $t0, $t0, 1 \n" + "srl $t1, $t6, 16 \n" + "sb $t1, 0(%[d]) \n" + "sb $t0, 1(%[d]) \n" + "sb $t6, 2(%[d]) \n" + "bgtz %[dst_width], 1b \n" + " addiu %[d], %[d], 3 \n" + "3: \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [src_stride] "+r" (src_stride), + [d] "+r" (d), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", + "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */, + uint8* dst, int dst_width) { + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |3|2|1|0| + "lw $t1, 4(%[src_ptr]) \n" // |7|6|5|4| + "lw $t2, 8(%[src_ptr]) \n" // |11|10|9|8| + "lw $t3, 12(%[src_ptr]) \n" // |15|14|13|12| + "lw 
$t4, 16(%[src_ptr]) \n" // |19|18|17|16| + "lw $t5, 20(%[src_ptr]) \n" // |23|22|21|20| + "lw $t6, 24(%[src_ptr]) \n" // |27|26|25|24| + "lw $t7, 28(%[src_ptr]) \n" // |31|30|29|28| + "wsbh $t0, $t0 \n" // |2|3|0|1| + "wsbh $t6, $t6 \n" // |26|27|24|25| + "srl $t0, $t0, 8 \n" // |X|2|3|0| + "srl $t3, $t3, 16 \n" // |X|X|15|14| + "srl $t5, $t5, 16 \n" // |X|X|23|22| + "srl $t7, $t7, 16 \n" // |X|X|31|30| + "ins $t1, $t2, 24, 8 \n" // |8|6|5|4| + "ins $t6, $t5, 0, 8 \n" // |26|27|24|22| + "ins $t1, $t0, 0, 16 \n" // |8|6|3|0| + "ins $t6, $t7, 24, 8 \n" // |30|27|24|22| + "prepend $t2, $t3, 24 \n" // |X|15|14|11| + "ins $t4, $t4, 16, 8 \n" // |19|16|17|X| + "ins $t4, $t2, 0, 16 \n" // |19|16|14|11| + "addiu %[src_ptr], %[src_ptr], 32 \n" + "addiu %[dst_width], %[dst_width], -12 \n" + "addiu $t8,%[dst_width], -12 \n" + "sw $t1, 0(%[dst]) \n" + "sw $t4, 4(%[dst]) \n" + "sw $t6, 8(%[dst]) \n" + "bgez $t8, 1b \n" + " addiu %[dst], %[dst], 12 \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst] "+r" (dst), + [dst_width] "+r" (dst_width) + : + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + +void ScaleRowDown38_2_Int_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* t = src_ptr + stride; + const int c = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[t]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[t]) \n" // |T7|T6|T5|T4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t4, $t1, $t3 \n" // |S7|S6|T7|T6| + "packrl.ph $t5, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t4, $t4 \n" // S7+S6+T7+T6 + "raddu.w.qb $t5, $t5 \n" // T5+T4+S5+S4 + "precrq.qb.ph $t6, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t6, $t6, $t6 \n" // |S3|T3|S3|T3| + "srl $t4, $t4, 2 \n" // t4 / 4 + "srl $t6, $t6, 16 \n" // |0|0|S3|T3| + "raddu.w.qb $t6, $t6 \n" // 0+0+S3+T3 + "addu $t6, $t5, $t6 \n" + "mul $t6, $t6, %[c] \n" // t6 * 0x2AAA + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "raddu.w.qb $t0, $t0 \n" // S2+S1+S0+0 + "raddu.w.qb $t2, $t2 \n" // T2+T1+T0+0 + "addu $t0, $t0, $t2 \n" + "mul $t0, $t0, %[c] \n" // t0 * 0x2AAA + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[t], %[t], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t4, -1(%[dst_ptr]) \n" + "sb $t6, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [t] "+r" (t), + [dst_width] "+r" (dst_width) + : [c] "r" (c) + : "t0", "t1", "t2", "t3", "t4", "t5", "t6" + ); +} + +void ScaleRowDown38_3_Int_MIPS_DSPR2(const uint8* src_ptr, + ptrdiff_t src_stride, + uint8* dst_ptr, int dst_width) { + intptr_t stride = src_stride; + const uint8* s1 = src_ptr + stride; + stride += stride; + const uint8* s2 = src_ptr + stride; + const int c1 = 0x1C71; + const int c2 = 0x2AAA; + + __asm__ __volatile__ ( + ".set push \n" + ".set noreorder \n" + "1: \n" + "lw $t0, 0(%[src_ptr]) \n" // |S3|S2|S1|S0| + "lw $t1, 4(%[src_ptr]) \n" // |S7|S6|S5|S4| + "lw $t2, 0(%[s1]) \n" // |T3|T2|T1|T0| + "lw $t3, 4(%[s1]) \n" // |T7|T6|T5|T4| + "lw $t4, 0(%[s2]) \n" // |R3|R2|R1|R0| + "lw $t5, 4(%[s2]) \n" // |R7|R6|R5|R4| + "rotr $t1, $t1, 16 \n" // |S5|S4|S7|S6| + "packrl.ph $t6, $t1, $t3 \n" // |S7|S6|T7|T6| + "raddu.w.qb 
$t6, $t6 \n" // S7+S6+T7+T6 + "packrl.ph $t7, $t3, $t1 \n" // |T5|T4|S5|S4| + "raddu.w.qb $t7, $t7 \n" // T5+T4+S5+S4 + "sll $t8, $t5, 16 \n" // |R5|R4|0|0| + "raddu.w.qb $t8, $t8 \n" // R5+R4 + "addu $t7, $t7, $t8 \n" + "srl $t8, $t5, 16 \n" // |0|0|R7|R6| + "raddu.w.qb $t8, $t8 \n" // R7 + R6 + "addu $t6, $t6, $t8 \n" + "mul $t6, $t6, %[c2] \n" // t6 * 0x2AAA + "precrq.qb.ph $t8, $t0, $t2 \n" // |S3|S1|T3|T1| + "precrq.qb.ph $t8, $t8, $t4 \n" // |S3|T3|R3|R1| + "srl $t8, $t8, 8 \n" // |0|S3|T3|R3| + "raddu.w.qb $t8, $t8 \n" // S3 + T3 + R3 + "addu $t7, $t7, $t8 \n" + "mul $t7, $t7, %[c1] \n" // t7 * 0x1C71 + "sll $t0, $t0, 8 \n" // |S2|S1|S0|0| + "sll $t2, $t2, 8 \n" // |T2|T1|T0|0| + "sll $t4, $t4, 8 \n" // |R2|R1|R0|0| + "raddu.w.qb $t0, $t0 \n" + "raddu.w.qb $t2, $t2 \n" + "raddu.w.qb $t4, $t4 \n" + "addu $t0, $t0, $t2 \n" + "addu $t0, $t0, $t4 \n" + "mul $t0, $t0, %[c1] \n" // t0 * 0x1C71 + "addiu %[src_ptr], %[src_ptr], 8 \n" + "addiu %[s1], %[s1], 8 \n" + "addiu %[s2], %[s2], 8 \n" + "addiu %[dst_width], %[dst_width], -3 \n" + "addiu %[dst_ptr], %[dst_ptr], 3 \n" + "srl $t6, $t6, 16 \n" + "srl $t7, $t7, 16 \n" + "srl $t0, $t0, 16 \n" + "sb $t6, -1(%[dst_ptr]) \n" + "sb $t7, -2(%[dst_ptr]) \n" + "bgtz %[dst_width], 1b \n" + " sb $t0, -3(%[dst_ptr]) \n" + ".set pop \n" + : [src_ptr] "+r" (src_ptr), + [dst_ptr] "+r" (dst_ptr), + [s1] "+r" (s1), + [s2] "+r" (s2), + [dst_width] "+r" (dst_width) + : [c1] "r" (c1), [c2] "r" (c2) + : "t0", "t1", "t2", "t3", "t4", + "t5", "t6", "t7", "t8" + ); +} + void ScaleFilterRows_MIPS_DSPR2(unsigned char *dst_ptr, const unsigned char* src_ptr, ptrdiff_t src_stride, diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index a13fcbabf..7ed9bd0fa 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -630,4 +630,70 @@ TEST_F(libyuvTest, TestAffine) { #endif } +TEST_F(libyuvTest, TestCopyPlane) { + int err = 0; + int yw = benchmark_width_; + int yh = benchmark_height_; + int b = 12; + int i, j; + + int y_plane_size = (yw + b * 2) * (yh + b * 2); + srandom(time(NULL)); + align_buffer_16(orig_y, y_plane_size) + align_buffer_16(dst_c, y_plane_size) + align_buffer_16(dst_opt, y_plane_size); + + memset(orig_y, 0, y_plane_size); + memset(dst_c, 0, y_plane_size); + memset(dst_opt, 0, y_plane_size); + + // Fill image buffers with random data. + for (i = b; i < (yh + b); ++i) { + for (j = b; j < (yw + b); ++j) { + orig_y[i * (yw + b * 2) + j] = random() & 0xff; + } + } + + // Fill destination buffers with random data. + for (i = 0; i < y_plane_size; ++i) { + uint8 random_number = random() & 0x7f; + dst_c[i] = random_number; + dst_opt[i] = dst_c[i]; + } + + int y_off = b * (yw + b * 2) + b; + + int y_st = yw + b * 2; + int stride = 8; + + // Disable all optimizations. + MaskCpuFlags(0); + double c_time = get_time(); + for (j = 0; j < benchmark_iterations_; j++) { + CopyPlane(orig_y + y_off, y_st, dst_c + y_off, stride, yw, yh); + } + c_time = (get_time() - c_time) / benchmark_iterations_; + + // Enable optimizations. 
+ MaskCpuFlags(-1); + double opt_time = get_time(); + for (j = 0; j < benchmark_iterations_; j++) { + CopyPlane(orig_y + y_off, y_st, dst_opt + y_off, stride, yw, yh); + } + opt_time = (get_time() - opt_time) / benchmark_iterations_; + printf(" %8d us C - %8d us OPT\n", + static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6)); + + for (i = 0; i < y_plane_size; ++i) { + if (dst_c[i] != dst_opt[i]) + ++err; + } + + free_aligned_buffer_16(orig_y) + free_aligned_buffer_16(dst_c) + free_aligned_buffer_16(dst_opt) + + EXPECT_EQ(0, err); +} + } // namespace libyuv
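The I422To*Row_MIPS_DSPR2 functions in row_mips.cc pack the results of a 6-bit fixed-point YUV-to-RGB conversion: the repl.ph constants visible above (YG 74, UG -25, VG -52, VR 102, a 16 luma bias and a 128 chroma bias) are the usual BT.601 weights scaled by 64, and the 0x00ff00ff mask in $s6 forces alpha to 0xff. A minimal scalar sketch of that per-pixel arithmetic, assuming the >> 6 and the saturation happen inside the shared I422ToTransientMipsRGB macro; the U-to-blue coefficient is defined in that macro rather than in this hunk, so it is left as a parameter (ub) in the sketch:

#include <stdint.h>

static inline uint8_t Clamp255(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

// One pixel of the fixed-point conversion set up by the repl.ph constants.
// b/g/r are returned separately; the asm then shuffles them (precr/packrl)
// into the destination byte order and ORs in the 0xff alpha from $s6.
static void YuvPixelSketch(uint8_t y, uint8_t u, uint8_t v, int ub,
                           uint8_t* b, uint8_t* g, uint8_t* r) {
  int y1 = ((int)y - 16) * 74;   // repl.ph $s0, 74 and $s4, 16
  int du = (int)u - 128;         // repl.ph $s5, 128
  int dv = (int)v - 128;
  *b = Clamp255((y1 + du * ub) >> 6);
  *g = Clamp255((y1 + du * (-25) + dv * (-52)) >> 6);  // $s1, $s2
  *r = Clamp255((y1 + dv * 102) >> 6);                 // $s3
}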
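ScaleRowDown4_MIPS_DSPR2 reads 32 source bytes per iteration and keeps every fourth byte with two rounds of precr.qb.ph, while the Int variant box-filters a 4x4 block and rounds (shra_r.w by 4 is (sum + 8) >> 4). A minimal portable sketch of the same arithmetic for reference; the _Ref names are illustrative only, and the unfiltered path keeps the first pixel of each group of four, as the residue loop above does:

#include <stddef.h>
#include <stdint.h>

// 1/4 horizontal subsample: keep byte 0 of every group of 4.
static void ScaleRowDown4_Ref(const uint8_t* src_ptr, ptrdiff_t /* src_stride */,
                              uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    dst[x] = src_ptr[x * 4];
  }
}

// 1/4 in both directions: rounded average of each 4x4 block of source pixels.
static void ScaleRowDown4Int_Ref(const uint8_t* src_ptr, ptrdiff_t src_stride,
                                 uint8_t* dst, int dst_width) {
  for (int x = 0; x < dst_width; ++x) {
    const uint8_t* s = src_ptr + x * 4;
    uint32_t sum = 0;
    for (int row = 0; row < 4; ++row) {
      for (int col = 0; col < 4; ++col) {
        sum += s[row * src_stride + col];
      }
    }
    dst[x] = (uint8_t)((sum + 8) >> 4);  // same rounding as shra_r.w ..., 4
  }
}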
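ScaleRowDown38_2_Int_MIPS_DSPR2 and ScaleRowDown38_3_Int_MIPS_DSPR2 avoid integer division by multiplying small pixel sums by 0x2AAA (about 65536/6, for the 2x3 and 3x2 sums) or 0x1C71 (about 65536/9, for the 3x3 sums) and taking the high 16 bits; for the sums that occur here (at most 9 * 255) the result matches the exact quotient to within one. A small standalone sketch of that multiply-high trick:

#include <stdint.h>
#include <stdio.h>

// (sum * k) >> 16 with k ~= 65536 / N approximates sum / N without a divide.
static inline uint8_t MulHiDiv(unsigned sum, unsigned k) {
  return (uint8_t)((sum * k) >> 16);
}

int main(void) {
  const unsigned kDiv6 = 0x2AAA;  // ~65536 / 6
  const unsigned kDiv9 = 0x1C71;  // ~65536 / 9
  for (unsigned sum = 0; sum <= 9 * 255; sum += 255) {
    printf("sum=%4u  /6 exact=%3u approx=%3u   /9 exact=%3u approx=%3u\n",
           sum, sum / 6, MulHiDiv(sum, kDiv6), sum / 9, MulHiDiv(sum, kDiv9));
  }
  return 0;
}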