From 7e050595571728eacb2855f4b2201c346aef6a61 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Wed, 1 Apr 2020 05:55:49 -0700 Subject: [PATCH] Apply clang format to libyuv source Bug: None Change-Id: Ifd16b59d7f0dbf4402dd5741bb89d1ec06dfaac8 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2131868 Reviewed-by: Frank Barchard Reviewed-by: Hsiu Wang Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/planar_functions.h | 10 +- include/libyuv/rotate.h | 14 - include/libyuv/version.h | 2 +- source/planar_functions.cc | 32 +- source/row_common.cc | 3 +- source/row_gcc.cc | 29 +- source/row_mmi.cc | 2668 +++++++++++++++-------------- source/row_neon.cc | 34 +- source/row_neon64.cc | 62 +- 10 files changed, 1481 insertions(+), 1375 deletions(-) diff --git a/README.chromium b/README.chromium index 0aac5acc8..f24f37bb4 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1747 +Version: 1748 License: BSD License File: LICENSE diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 8d96c8fc9..30a6d3bcd 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -319,11 +319,11 @@ int ARGBMirror(const uint8_t* src_argb, // RGB24 mirror. LIBYUV_API int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height); + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); // Mirror a plane of data. LIBYUV_API diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h index 7a550b989..308882242 100644 --- a/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -83,20 +83,6 @@ int NV12ToI420Rotate(const uint8_t* src_y, int height, enum RotationMode mode); -// Rotate NV12 input and store in NV12. -LIBYUV_API -int NV12Rotate(const uint8_t* src_y, - int src_stride_y, - const uint8_t* src_uv, - int src_stride_uv, - uint8_t* dst_y, - int dst_stride_y, - uint8_t* dst_uv, - int dst_stride_uv, - int width, - int height, - enum RotationMode mode); - // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API int RotatePlane(const uint8_t* src, diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 3f314b072..6e30190b2 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1747 +#define LIBYUV_VERSION 1748 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index e42795446..eea4fdc56 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -986,8 +986,12 @@ int YUY2ToY(const uint8_t* src_yuy2, // Mirror a plane of data. // See Also I400Mirror LIBYUV_API -void MirrorPlane(const uint8_t* src_y, int src_stride_y, uint8_t* dst_y, - int dst_stride_y, int width, int height) { +void MirrorPlane(const uint8_t* src_y, + int src_stride_y, + uint8_t* dst_y, + int dst_stride_y, + int width, + int height) { int y; void (*MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = MirrorRow_C; // Negative height means invert the image. @@ -1182,11 +1186,11 @@ int ARGBMirror(const uint8_t* src_argb, // RGB24 mirror. LIBYUV_API int RGB24Mirror(const uint8_t* src_rgb24, - int src_stride_rgb24, - uint8_t* dst_rgb24, - int dst_stride_rgb24, - int width, - int height) { + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { int y; void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = RGB24MirrorRow_C; @@ -3101,14 +3105,11 @@ int GaussPlane_F32(const float* src, int width, int height) { int y; - void (*GaussCol_F32)(const float* src0, - const float* src1, - const float* src2, - const float* src3, - const float* src4, - float* dst, + void (*GaussCol_F32)(const float* src0, const float* src1, const float* src2, + const float* src3, const float* src4, float* dst, int width) = GaussCol_F32_C; - void (*GaussRow_F32)(const float* src, float* dst, int width) = GaussRow_F32_C; + void (*GaussRow_F32)(const float* src, float* dst, int width) = + GaussRow_F32_C; if (!src || !dst || width <= 0 || height == 0) { return -1; } @@ -3139,10 +3140,9 @@ int GaussPlane_F32(const float* src, const float* src1 = src; const float* src2 = src; const float* src3 = src2 + ((height > 1) ? src_stride : 0); - const float* src4 = src3 + ((height > 2) ? src_stride: 0); + const float* src4 = src3 + ((height > 2) ? src_stride : 0); for (y = 0; y < height; ++y) { - GaussCol_F32(src0, src1, src2, src3, src4, row, width); // Extrude edge by 2 floats diff --git a/source/row_common.cc b/source/row_common.cc index 9cabaaaca..800e22012 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2201,8 +2201,7 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } -void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, - int width) { +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, int width) { int x; src_rgb24 += width * 3 - 3; for (x = 0; x < width; ++x) { diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 087e76292..d8480d56e 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3266,14 +3266,15 @@ void MirrorUVRow_SSSE3(const uint8_t* src, // Shuffle first 5 pixels to last 5 mirrored. first byte zero static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, - 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; + 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; // Shuffle last 5 pixels to first 5 mirrored. last byte zero -static const uvec8 kShuffleMirrorRGB1 = {13u, 14u, 15u, 10u, 11u, 12u, 7u, - 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; +static const uvec8 kShuffleMirrorRGB1 = { + 13u, 14u, 15u, 10u, 11u, 12u, 7u, 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; // Shuffle 5 pixels at a time (15 bytes) -void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, int width) { intptr_t temp_width = (intptr_t)(width); src_rgb24 += width * 3 - 48; @@ -3292,21 +3293,21 @@ void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, "pshufb %%xmm4,%%xmm2 \n" "pshufb %%xmm5,%%xmm3 \n" "lea -0x30(%0),%0 \n" - "movdqu %%xmm0,32(%1) \n" // last 5 - "movdqu %%xmm1,17(%1) \n" // next 5 - "movdqu %%xmm2,2(%1) \n" // next 5 - "movlpd %%xmm3,0(%1) \n" // first 1 + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 "lea 0x30(%1),%1 \n" "sub $0x10,%2 \n" "jg 1b \n" - : "+r"(src_rgb24), // %0 - "+r"(dst_rgb24), // %1 - "+r"(temp_width) // %2 - : "m"(kShuffleMirrorRGB0), // %3 - "m"(kShuffleMirrorRGB1) // %4 + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0), // %3 + "m"(kShuffleMirrorRGB1) // %4 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); } -#endif // HAS_RGB24MIRRORROW_SSSE3 +#endif // HAS_RGB24MIRRORROW_SSSE3 #ifdef HAS_ARGBMIRRORROW_SSE2 diff --git a/source/row_mmi.cc b/source/row_mmi.cc index 50cfca726..f778d25e1 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -21,6 +21,8 @@ extern "C" { // This module is for Mips MMI. #if !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) +// clang-format off + void RGB24ToARGBRow_MMI(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { @@ -6040,90 +6042,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + __asm__ volatile ( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - "punpcklbh %[u], %[u], %[zero] \n\t" // u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t"//u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" // v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t"//v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // Also used for 420 @@ -6133,96 +6138,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" // v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t"//v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // 10 bit YUV to ARGB @@ -6232,96 +6240,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02), - [mask1] "f"(0x00ff00ff00ff00ff) - : "memory"); + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask), [two]"f"(0x02), + [mask1]"f"(0x00ff00ff00ff00ff) + : "memory" + ); } void I422AlphaToARGBRow_MMI(const uint8_t* src_y, @@ -6331,96 +6345,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v, a; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v,a; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a), - [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]), - [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]), - [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), - [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), - [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00), - [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), [a]"=&f"(a), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [a_ptr]"r"(src_a), [zero]"f"(0x00), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } void I422ToRGB24Row_MMI(const uint8_t* src_y, @@ -6429,105 +6449,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(mask), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void I422ToARGB4444Row_MMI(const uint8_t* src_y, @@ -6538,103 +6566,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4), - [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), + [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), + [alpha]"f"(-1) + : "memory" + ); } void I422ToARGB1555Row_MMI(const uint8_t* src_y, @@ -6645,118 +6680,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), - [mask3] "f"(0x800000008000), [lmove5] "f"(0x5) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [mask3]"f"(0x800000008000), + [lmove5]"f"(0x5) + : "memory" + ); } void I422ToRGB565Row_MMI(const uint8_t* src_y, @@ -6767,120 +6809,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7), - [lmove5] "f"(0x5) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7), + [lmove5]"f"(0x5) + : "memory" + ); } void NV12ToARGBRow_MMI(const uint8_t* src_y, @@ -6890,83 +6939,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV21ToARGBRow_MMI(const uint8_t* src_y, @@ -6976,83 +7033,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV12ToRGB24Row_MMI(const uint8_t* src_y, @@ -7062,95 +7127,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18), - [one] "f"(0x1), [rmove1] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [lmove1]"f"(0x18), + [one]"f"(0x1), [rmove1]"f"(0x8) + : "memory" + ); } void NV21ToRGB24Row_MMI(const uint8_t* src_y, @@ -7160,95 +7233,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18), - [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void NV12ToRGB565Row_MMI(const uint8_t* src_y, @@ -7258,115 +7339,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7) + : "memory" + ); } void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, @@ -7375,83 +7464,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, @@ -7460,83 +7556,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void I422ToRGBARow_MMI(const uint8_t* src_y, @@ -7547,105 +7650,114 @@ void I422ToRGBARow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [alpha]"f"(-1) + : "memory" + ); } void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + __asm__ volatile ( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32] "+&f"(v32) - : [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32]"+&f"(v32) + : [dst_ptr]"r"(dst_argb), [width]"r"(width) + : "memory" + ); } +// clang-format on // 10 bit YUV to ARGB #endif // !defined(LIBYUV_DISABLE_MMI) && defined(_MIPS_ARCH_LOONGSON3A) diff --git a/source/row_neon.cc b/source/row_neon.cc index c3e1cf814..eecec2910 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -726,24 +726,29 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, : "cc", "memory", "r12", "q0"); } -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { - src += width * 4 - 16; +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( + "add %0, %0, %2, lsl #2 \n" + "sub %0, #32 \n" + "1: \n" - "vld1.8 {q0}, [%0], %3 \n" // src -= 16 - "subs %2, #4 \n" // 4 pixels per loop. - "vrev64.32 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" + "vld4.8 {d0, d1, d2, d3}, [%0], %3 \n" // src -= 32 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vrev64.8 d3, d3 \n" + "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // dst += 32 "bgt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(-16) // %3 - : "cc", "memory", "q0"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"(-32) // %3 + : "cc", "memory", "d0", "d1", "d2", "d3"); } -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, int width) { src_rgb24 += width * 3 - 24; asm volatile( @@ -762,7 +767,8 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, : "cc", "memory", "d0", "d1", "d2"); } -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha diff --git a/source/row_neon64.cc b/source/row_neon64.cc index f9d56992c..6e1bdf142 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -265,6 +265,8 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y, "sri v0.8h, v21.8h, #5 \n" /* RG */ \ "sri v0.8h, v20.8h, #11 \n" /* RGB */ +// clang-format off + void I422ToRGB565Row_NEON(const uint8_t* src_y, const uint8_t* src_u, const uint8_t* src_v, @@ -272,15 +274,15 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels - // RGB565. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 - // lines ahead - "b.gt 1b \n" + YUVTORGB_SETUP + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + ARGBTORGB565 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -310,16 +312,16 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, const struct YuvConstants* yuvconstants, int width) { asm volatile( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" - "1: \n" READYUV422 YUVTORGB( - v22, v21, - v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 - "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels - // RGB565. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 - // lines ahead - "b.gt 1b \n" + YUVTORGB_SETUP + "movi v23.8b, #255 \n" + "1: \n" + READYUV422 + YUVTORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + ARGBTOARGB1555 + "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -332,6 +334,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"); } +// clang-format on #define ARGBTOARGB4444 \ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ @@ -786,8 +789,7 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, : "cc", "memory", "v0", "v1"); } -void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, - int width) { +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) { asm volatile( "ld1 {v4.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw #2 \n" // Start at end of row. @@ -801,23 +803,26 @@ void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, "tbl v3.16b, {v3.16b}, v4.16b \n" "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64 "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_argb), // %1 + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 "+r"(width) // %2 : "r"((ptrdiff_t)-64), // %3 "r"(&kShuffleMirror) // %4 : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); } -void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, + uint8_t* dst_rgb24, int width) { - src_rgb24 += width * 3 - 48; asm volatile( "ld1 {v3.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #1 \n" // Start at end of row. + "add %0, %0, %w2, sxtw \n" + "sub %0, %0, #48 \n" "1: \n" "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3\n" // src -= 48 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "subs %w2, %w2, #16 \n" // 16 pixels per loop. "tbl v0.16b, {v0.16b}, v3.16b \n" "tbl v1.16b, {v1.16b}, v3.16b \n" "tbl v2.16b, {v2.16b}, v3.16b \n" @@ -2211,11 +2216,9 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines - // ahead + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_yj), // %1 @@ -2369,7 +2372,6 @@ void ARGBAttenuateRow_NEON(const uint8_t* src_argb, "uqrshrn v1.8b, v5.8h, #8 \n" // g >>= 8 "uqrshrn v2.8b, v6.8h, #8 \n" // r >>= 8 "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n" // store 8 ARGB - // pixels "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "b.gt 1b \n" : "+r"(src_argb), // %0