From aabcc477bdca9457bff773d3e767342e3d6af274 Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 24 Mar 2020 12:07:52 -0700 Subject: [PATCH] RGB24Mirror function Bug: b/151960427 Change-Id: I413db0011a4ed87eefc0dd166bb8e076b5aa4b1d Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2116639 Commit-Queue: Frank Barchard Reviewed-by: richard winterton --- include/libyuv/planar_functions.h | 12 + include/libyuv/rotate.h | 14 + include/libyuv/row.h | 16 +- include/libyuv/video_common.h | 12 +- source/convert.cc | 24 +- source/convert_argb.cc | 3 - source/planar_functions.cc | 48 +- source/rotate_argb.cc | 2 +- source/row_any.cc | 8 +- source/row_common.cc | 28 +- source/row_gcc.cc | 46 + source/row_mmi.cc | 2665 ++++++++++++++--------------- source/row_neon.cc | 35 +- source/row_neon64.cc | 119 +- unit_test/convert_test.cc | 1 + unit_test/scale_test.cc | 110 +- 16 files changed, 1614 insertions(+), 1529 deletions(-) diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h index 2aa95335a..8d96c8fc9 100644 --- a/include/libyuv/planar_functions.h +++ b/include/libyuv/planar_functions.h @@ -313,6 +313,18 @@ int ARGBMirror(const uint8_t* src_argb, int width, int height); +// Alias +#define RGB24ToRGB24Mirror RGB24Mirror + +// RGB24 mirror. +LIBYUV_API +int RGB24Mirror(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height); + // Mirror a plane of data. LIBYUV_API void MirrorPlane(const uint8_t* src_y, diff --git a/include/libyuv/rotate.h b/include/libyuv/rotate.h index 308882242..7a550b989 100644 --- a/include/libyuv/rotate.h +++ b/include/libyuv/rotate.h @@ -83,6 +83,20 @@ int NV12ToI420Rotate(const uint8_t* src_y, int height, enum RotationMode mode); +// Rotate NV12 input and store in NV12. +LIBYUV_API +int NV12Rotate(const uint8_t* src_y, + int src_stride_y, + const uint8_t* src_uv, + int src_stride_uv, + uint8_t* dst_y, + int dst_stride_y, + uint8_t* dst_uv, + int dst_stride_uv, + int width, + int height, + enum RotationMode mode); + // Rotate a plane by 0, 90, 180, or 270. LIBYUV_API int RotatePlane(const uint8_t* src, diff --git a/include/libyuv/row.h b/include/libyuv/row.h index d66dfcd96..1e177a611 100644 --- a/include/libyuv/row.h +++ b/include/libyuv/row.h @@ -279,6 +279,7 @@ extern "C" { #define HAS_I422TOAR30ROW_SSSE3 #define HAS_MERGERGBROW_SSSE3 #define HAS_RAWTORGBAROW_SSSE3 +#define HAS_RGB24MIRRORROW_SSSE3 #define HAS_RGBATOYJROW_SSSE3 #define HAS_SPLITRGBROW_SSSE3 #define HAS_SWAPUVROW_SSSE3 @@ -408,6 +409,7 @@ extern "C" { #define HAS_ARGBCOLORMATRIXROW_NEON #define HAS_ARGBGRAYROW_NEON #define HAS_ARGBMIRRORROW_NEON +#define HAS_RGB24MIRRORROW_NEON #define HAS_ARGBMULTIPLYROW_NEON #define HAS_ARGBQUANTIZEROW_NEON #define HAS_ARGBSEPIAROW_NEON @@ -1196,7 +1198,9 @@ void BGRAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ABGRToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGBAToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); -void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24ToYJRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); void RAWToYRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RAWToYJRow_Any_SSSE3(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void RGB24ToYJRow_Any_AVX2(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); @@ -1608,6 +1612,16 @@ void ARGBMirrorRow_Any_NEON(const uint8_t* src_ptr, void ARGBMirrorRow_Any_MSA(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); void ARGBMirrorRow_Any_MMI(const uint8_t* src_ptr, uint8_t* dst_ptr, int width); +void RGB24MirrorRow_SSSE3(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_C(const uint8_t* src, uint8_t* dst, int width); +void RGB24MirrorRow_Any_SSSE3(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); +void RGB24MirrorRow_Any_NEON(const uint8_t* src_ptr, + uint8_t* dst_ptr, + int width); + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, diff --git a/include/libyuv/video_common.h b/include/libyuv/video_common.h index 666eb3439..cbfe95732 100644 --- a/include/libyuv/video_common.h +++ b/include/libyuv/video_common.h @@ -86,10 +86,14 @@ enum FourCC { FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'), FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'), FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420. - FOURCC_J420 = FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J422 = FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J444 = FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc - FOURCC_J400 = FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J420 = + FOURCC('J', '4', '2', '0'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J422 = + FOURCC('J', '4', '2', '2'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J444 = + FOURCC('J', '4', '4', '4'), // jpeg (bt.601 full), unofficial fourcc + FOURCC_J400 = + FOURCC('J', '4', '0', '0'), // jpeg (bt.601 full), unofficial fourcc FOURCC_H420 = FOURCC('H', '4', '2', '0'), // bt.709, unofficial fourcc FOURCC_H422 = FOURCC('H', '4', '2', '2'), // bt.709, unofficial fourcc FOURCC_H444 = FOURCC('H', '4', '4', '4'), // bt.709, unofficial fourcc diff --git a/source/convert.cc b/source/convert.cc index d1dc9a42e..3a03e456d 100644 --- a/source/convert.cc +++ b/source/convert.cc @@ -1599,8 +1599,7 @@ int RGB24ToJ420(const uint8_t* src_rgb24, int height) { int y; #if (defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || \ - defined(HAS_RGB24TOYJROW_MMI) + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI) void (*RGB24ToUVJRow)(const uint8_t* src_rgb24, int src_stride_rgb24, uint8_t* dst_u, uint8_t* dst_v, int width) = RGB24ToUVJRow_C; @@ -1698,8 +1697,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, #endif for (y = 0; y < height - 1; y += 2) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) +#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) RGB24ToUVJRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); RGB24ToYJRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width); @@ -1716,8 +1715,8 @@ int RGB24ToJ420(const uint8_t* src_rgb24, dst_v += dst_stride_v; } if (height & 1) { -#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ - defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) +#if ((defined(HAS_RGB24TOYJROW_NEON) && defined(HAS_RGB24TOUVJROW_NEON)) || \ + defined(HAS_RGB24TOYJROW_MSA) || defined(HAS_RGB24TOYJROW_MMI)) RGB24ToUVJRow(src_rgb24, 0, dst_u, dst_v, width); RGB24ToYJRow(src_rgb24, dst_y, width); #else @@ -1748,8 +1747,7 @@ int RAWToI420(const uint8_t* src_raw, int height) { int y; #if (defined(HAS_RAWTOYROW_NEON) && defined(HAS_RAWTOUVROW_NEON)) || \ - defined(HAS_RAWTOYROW_MSA) || \ - defined(HAS_RAWTOYROW_MMI) + defined(HAS_RAWTOYROW_MSA) || defined(HAS_RAWTOYROW_MMI) void (*RAWToUVRow)(const uint8_t* src_raw, int src_stride_raw, uint8_t* dst_u, uint8_t* dst_v, int width) = RAWToUVRow_C; void (*RAWToYRow)(const uint8_t* src_raw, uint8_t* dst_y, int width) = @@ -2448,11 +2446,11 @@ int RGB24ToJ400(const uint8_t* src_rgb24, // Convert RAW to J400. LIBYUV_API int RAWToJ400(const uint8_t* src_raw, - int src_stride_raw, - uint8_t* dst_yj, - int dst_stride_yj, - int width, - int height) { + int src_stride_raw, + uint8_t* dst_yj, + int dst_stride_yj, + int width, + int height) { int y; void (*RAWToYJRow)(const uint8_t* src_raw, uint8_t* dst_yj, int width) = RAWToYJRow_C; diff --git a/source/convert_argb.cc b/source/convert_argb.cc index 4217b1dc9..e2455a857 100644 --- a/source/convert_argb.cc +++ b/source/convert_argb.cc @@ -1270,9 +1270,6 @@ static int I210ToARGBMatrix(const uint16_t* src_y, return 0; } - - - // Convert I210 to ARGB. LIBYUV_API int I210ToARGB(const uint16_t* src_y, diff --git a/source/planar_functions.cc b/source/planar_functions.cc index b6aac913d..e42795446 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -1132,7 +1132,7 @@ int ARGBMirror(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 16)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } @@ -1179,6 +1179,52 @@ int ARGBMirror(const uint8_t* src_argb, return 0; } +// RGB24 mirror. +LIBYUV_API +int RGB24Mirror(const uint8_t* src_rgb24, + int src_stride_rgb24, + uint8_t* dst_rgb24, + int dst_stride_rgb24, + int width, + int height) { + int y; + void (*RGB24MirrorRow)(const uint8_t* src, uint8_t* dst, int width) = + RGB24MirrorRow_C; + if (!src_rgb24 || !dst_rgb24 || width <= 0 || height == 0) { + return -1; + } + // Negative height means invert the image. + if (height < 0) { + height = -height; + src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24; + src_stride_rgb24 = -src_stride_rgb24; + } +#if defined(HAS_RGB24MIRRORROW_NEON) + if (TestCpuFlag(kCpuHasNEON)) { + RGB24MirrorRow = RGB24MirrorRow_Any_NEON; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_NEON; + } + } +#endif +#if defined(HAS_RGB24MIRRORROW_SSSE3) + if (TestCpuFlag(kCpuHasSSSE3)) { + RGB24MirrorRow = RGB24MirrorRow_Any_SSSE3; + if (IS_ALIGNED(width, 16)) { + RGB24MirrorRow = RGB24MirrorRow_SSSE3; + } + } +#endif + + // Mirror plane + for (y = 0; y < height; ++y) { + RGB24MirrorRow(src_rgb24, dst_rgb24, width); + src_rgb24 += src_stride_rgb24; + dst_rgb24 += dst_stride_rgb24; + } + return 0; +} + // Get a blender that optimized for the CPU and pixel count. // As there are 6 blenders to choose from, the caller should try to use // the same blend function for all pixels if possible. diff --git a/source/rotate_argb.cc b/source/rotate_argb.cc index 12a240f3e..cdfdcc5c5 100644 --- a/source/rotate_argb.cc +++ b/source/rotate_argb.cc @@ -126,7 +126,7 @@ static int ARGBRotate180(const uint8_t* src_argb, #if defined(HAS_ARGBMIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { ARGBMirrorRow = ARGBMirrorRow_Any_NEON; - if (IS_ALIGNED(width, 4)) { + if (IS_ALIGNED(width, 16)) { ARGBMirrorRow = ARGBMirrorRow_NEON; } } diff --git a/source/row_any.cc b/source/row_any.cc index 0cf1a6f5e..2e9538bde 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1189,7 +1189,7 @@ ANY11M(ARGBMirrorRow_Any_AVX2, ARGBMirrorRow_AVX2, 4, 7) ANY11M(ARGBMirrorRow_Any_SSE2, ARGBMirrorRow_SSE2, 4, 3) #endif #ifdef HAS_ARGBMIRRORROW_NEON -ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 3) +ANY11M(ARGBMirrorRow_Any_NEON, ARGBMirrorRow_NEON, 4, 15) #endif #ifdef HAS_ARGBMIRRORROW_MSA ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) @@ -1197,6 +1197,12 @@ ANY11M(ARGBMirrorRow_Any_MSA, ARGBMirrorRow_MSA, 4, 15) #ifdef HAS_ARGBMIRRORROW_MMI ANY11M(ARGBMirrorRow_Any_MMI, ARGBMirrorRow_MMI, 4, 1) #endif +#ifdef HAS_RGB24MIRRORROW_SSSE3 +ANY11M(RGB24MirrorRow_Any_SSSE3, RGB24MirrorRow_SSSE3, 3, 15) +#endif +#ifdef HAS_RGB24MIRRORROW_NEON +ANY11M(RGB24MirrorRow_Any_NEON, RGB24MirrorRow_NEON, 3, 15) +#endif #undef ANY11M // Any 1 plane. (memset) diff --git a/source/row_common.cc b/source/row_common.cc index 97451dffa..9cabaaaca 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -2201,6 +2201,22 @@ void ARGBMirrorRow_C(const uint8_t* src, uint8_t* dst, int width) { } } +void RGB24MirrorRow_C(const uint8_t* src_rgb24, uint8_t* dst_rgb24, + int width) { + int x; + src_rgb24 += width * 3 - 3; + for (x = 0; x < width; ++x) { + uint8_t b = src_rgb24[0]; + uint8_t g = src_rgb24[1]; + uint8_t r = src_rgb24[2]; + dst_rgb24[0] = b; + dst_rgb24[1] = g; + dst_rgb24[2] = r; + src_rgb24 -= 3; + dst_rgb24 += 3; + } +} + void SplitUVRow_C(const uint8_t* src_uv, uint8_t* dst_u, uint8_t* dst_v, @@ -3321,7 +3337,7 @@ void RGB24ToYJRow_AVX2(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { width -= twidth; } } -#endif //HAS_RGB24TOYJROW_AVX2 +#endif // HAS_RGB24TOYJROW_AVX2 #ifdef HAS_RAWTOYJROW_AVX2 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. @@ -3337,7 +3353,7 @@ void RAWToYJRow_AVX2(const uint8_t* src_raw, uint8_t* dst_yj, int width) { width -= twidth; } } -#endif //HAS_RAWTOYJROW_AVX2 +#endif // HAS_RAWTOYJROW_AVX2 #ifdef HAS_RGB24TOYJROW_SSSE3 // Convert 16 RGB24 pixels (64 bytes) to 16 YJ values. @@ -3353,7 +3369,7 @@ void RGB24ToYJRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { width -= twidth; } } -#endif //HAS_RGB24TOYJROW_SSSE3 +#endif // HAS_RGB24TOYJROW_SSSE3 #ifdef HAS_RAWTOYJROW_SSSE3 // Convert 16 RAW pixels (64 bytes) to 16 YJ values. @@ -3369,7 +3385,7 @@ void RAWToYJRow_SSSE3(const uint8_t* src_raw, uint8_t* dst_yj, int width) { width -= twidth; } } -#endif //HAS_RAWTOYJROW_SSSE3 +#endif // HAS_RAWTOYJROW_SSSE3 float ScaleSumSamples_C(const float* src, float* dst, float scale, int width) { float fsum = 0.f; @@ -3427,8 +3443,8 @@ void GaussCol_C(const uint16_t* src0, void GaussRow_F32_C(const float* src, float* dst, int width) { int i; for (i = 0; i < width; ++i) { - *dst++ = - (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * (1.0f / 256.0f); + *dst++ = (src[0] + src[1] * 4 + src[2] * 6 + src[3] * 4 + src[4]) * + (1.0f / 256.0f); ++src; } } diff --git a/source/row_gcc.cc b/source/row_gcc.cc index c041ba112..087e76292 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -3262,6 +3262,52 @@ void MirrorUVRow_SSSE3(const uint8_t* src, } #endif // HAS_MIRRORUVROW_SSSE3 +#ifdef HAS_RGB24MIRRORROW_SSSE3 + +// Shuffle first 5 pixels to last 5 mirrored. first byte zero +static const uvec8 kShuffleMirrorRGB0 = {128u, 12u, 13u, 14u, 9u, 10u, 11u, 6u, + 7u, 8u, 3u, 4u, 5u, 0u, 1u, 2u}; + +// Shuffle last 5 pixels to first 5 mirrored. last byte zero +static const uvec8 kShuffleMirrorRGB1 = {13u, 14u, 15u, 10u, 11u, 12u, 7u, + 8u, 9u, 4u, 5u, 6u, 1u, 2u, 3u, 128u}; + +// Shuffle 5 pixels at a time (15 bytes) +void RGB24MirrorRow_SSSE3(const uint8_t* src_rgb24, uint8_t* dst_rgb24, + int width) { + intptr_t temp_width = (intptr_t)(width); + src_rgb24 += width * 3 - 48; + asm volatile( + "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" + + LABELALIGN + "1: \n" + "movdqu (%0),%%xmm0 \n" // first 5 + "movdqu 15(%0),%%xmm1 \n" // next 5 + "movdqu 30(%0),%%xmm2 \n" // next 5 + "movdqu 32(%0),%%xmm3 \n" // last 1 special + "pshufb %%xmm4,%%xmm0 \n" + "pshufb %%xmm4,%%xmm1 \n" + "pshufb %%xmm4,%%xmm2 \n" + "pshufb %%xmm5,%%xmm3 \n" + "lea -0x30(%0),%0 \n" + "movdqu %%xmm0,32(%1) \n" // last 5 + "movdqu %%xmm1,17(%1) \n" // next 5 + "movdqu %%xmm2,2(%1) \n" // next 5 + "movlpd %%xmm3,0(%1) \n" // first 1 + "lea 0x30(%1),%1 \n" + "sub $0x10,%2 \n" + "jg 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(temp_width) // %2 + : "m"(kShuffleMirrorRGB0), // %3 + "m"(kShuffleMirrorRGB1) // %4 + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"); +} +#endif // HAS_RGB24MIRRORROW_SSSE3 + #ifdef HAS_ARGBMIRRORROW_SSE2 void ARGBMirrorRow_SSE2(const uint8_t* src, uint8_t* dst, int width) { diff --git a/source/row_mmi.cc b/source/row_mmi.cc index d7d34e47f..50cfca726 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -6040,93 +6040,90 @@ void I444ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t y, u, v; + uint64_t b_vec[2], g_vec[2], r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; - __asm__ volatile ( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension + uint64_t ub, ug, vg, vr, bb, bg, br, yg; + __asm__ volatile( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub + "or %[ub], %[ub], %[mask] \n\t" // must + // sign + // extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" // sign + // extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - "punpcklbh %[u], %[u], %[zero] \n\t"//u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" // u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t"//v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" // v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), + [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), + [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), + [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), + [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), + [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), + [five] "f"(0x55), [mask] "f"(mask) + : "memory"); } // Also used for 420 @@ -6136,99 +6133,96 @@ void I422ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t y, u, v; + uint64_t b_vec[2], g_vec[2], r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub - "or %[ub], %[ub], %[mask] \n\t"//must sign extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t"//sign extension + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub + "or %[ub], %[ub], %[mask] \n\t" // must + // sign + // extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" // sign + // extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" // u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t"//v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" // v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), + [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), + [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), + [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), + [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), + [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), + [five] "f"(0x55), [mask] "f"(mask) + : "memory"); } // 10 bit YUV to ARGB @@ -6238,102 +6232,96 @@ void I210ToARGBRow_MMI(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t y, u, v; + uint64_t b_vec[2], g_vec[2], r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), - [u]"=&f"(u), [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [alpha]"f"(-1), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask), [two]"f"(0x02), - [mask1]"f"(0x00ff00ff00ff00ff) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), + [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), + [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), + [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), + [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), + [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), + [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02), + [mask1] "f"(0x00ff00ff00ff00ff) + : "memory"); } void I422AlphaToARGBRow_MMI(const uint8_t* src_y, @@ -6343,102 +6331,96 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y,u,v,a; - uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t y, u, v, a; + uint64_t b_vec[2], g_vec[2], r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" // u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), [a]"=&f"(a), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [a_ptr]"r"(src_a), [zero]"f"(0x00), - [six]"f"(0x6), [five]"f"(0x55), - [mask]"f"(mask) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a), + [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]), + [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]), + [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), + [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), + [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00), + [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask) + : "memory"); } void I422ToRGB24Row_MMI(const uint8_t* src_y, @@ -6447,113 +6429,105 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y,u,v; - uint64_t b_vec[2],g_vec[2],r_vec[2]; + uint64_t y, u, v; + uint64_t b_vec[2], g_vec[2], r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" // u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), - [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), - [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(mask), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), + [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), + [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), + [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), + [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), + [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), + [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1) + : "memory"); } void I422ToARGB4444Row_MMI(const uint8_t* src_y, @@ -6564,110 +6538,103 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t"//y1 + "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t" // y1 - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t"//u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" // u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), - [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), - [alpha]"f"(-1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), + [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4), + [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1) + : "memory"); } void I422ToARGB1555Row_MMI(const uint8_t* src_y, @@ -6678,125 +6645,118 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [mask3]"f"(0x800000008000), - [lmove5]"f"(0x5) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), + [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), + [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), + [mask3] "f"(0x800000008000), [lmove5] "f"(0x5) + : "memory"); } void I422ToRGB565Row_MMI(const uint8_t* src_y, @@ -6807,127 +6767,120 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - //u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + // u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - //v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + // v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7), - [lmove5]"f"(0x5) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), + [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), + [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7), + [lmove5] "f"(0x5) + : "memory"); } void NV12ToARGBRow_MMI(const uint8_t* src_y, @@ -6937,91 +6890,83 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) + : "memory"); } void NV21ToARGBRow_MMI(const uint8_t* src_y, @@ -7031,91 +6976,83 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) + : "memory"); } void NV12ToRGB24Row_MMI(const uint8_t* src_y, @@ -7125,103 +7062,95 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [lmove1]"f"(0x18), - [one]"f"(0x1), [rmove1]"f"(0x8) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18), + [one] "f"(0x1), [rmove1] "f"(0x8) + : "memory"); } void NV21ToRGB24Row_MMI(const uint8_t* src_y, @@ -7231,103 +7160,95 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), - [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [lmove1]"f"(0x18), [rmove1]"f"(0x8), - [one]"f"(0x1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18), + [rmove1] "f"(0x8), [one] "f"(0x1) + : "memory"); } void NV12ToRGB565Row_MMI(const uint8_t* src_y, @@ -7337,123 +7258,115 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t" // 5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t"//5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t" // 5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), - [dst_rgb565]"r"(dst_rgb565), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [three]"f"(0x3), [mask2]"f"(0x1f0000001f), - [eight]"f"(0x8), [seven]"f"(0x7) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3), + [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7) + : "memory"); } void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, @@ -7462,90 +7375,83 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) + : "memory"); } void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, @@ -7554,90 +7460,83 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [ushu]"f"(0xA0), [vshu]"f"(0xf5), - [alpha]"f"(-1), [eight]"f"(0x8) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf), + [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), + [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), + [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) + : "memory"); } void I422ToRGBARow_MMI(const uint8_t* src_y, @@ -7648,112 +7547,104 @@ void I422ToRGBARow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub,ug,vg,vr,bb,bg,br,yg; + uint64_t ub, ug, vg, vr, bb, bg, br, yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y]"=&f"(y), [u]"=&f"(u), - [v]"=&f"(v), - [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), - [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), - [ub]"=&f"(ub), [ug]"=&f"(ug), - [vg]"=&f"(vg), [vr]"=&f"(vr), - [bb]"=&f"(bb), [bg]"=&f"(bg), - [br]"=&f"(br), [yg]"=&f"(yg) - : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), - [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), - [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), - [zero]"f"(0x00), [five]"f"(0x55), - [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), - [alpha]"f"(-1) - : "memory" - ); + : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), + [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), + [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), + [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) + : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), + [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), + [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), + [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1) + : "memory"); } void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile ( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + __asm__ volatile( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32]"+&f"(v32) - : [dst_ptr]"r"(dst_argb), [width]"r"(width) - : "memory" - ); + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32] "+&f"(v32) + : [dst_ptr] "r"(dst_argb), [width] "r"(width) + : "memory"); } // 10 bit YUV to ARGB diff --git a/source/row_neon.cc b/source/row_neon.cc index b379bbf61..c3e1cf814 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -727,14 +727,10 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, } void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { + src += width * 4 - 16; asm volatile( - // Start at end of source row. - "mov r3, #-16 \n" - "add %0, %0, %2, lsl #2 \n" - "sub %0, #16 \n" - "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 + "vld1.8 {q0}, [%0], %3 \n" // src -= 16 "subs %2, #4 \n" // 4 pixels per loop. "vrev64.32 q0, q0 \n" "vst1.8 {d1}, [%1]! \n" // dst += 16 @@ -743,12 +739,30 @@ void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0"); + : "r"(-16) // %3 + : "cc", "memory", "q0"); } -void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, - uint8_t* dst_argb, +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, + int width) { + src_rgb24 += width * 3 - 24; + asm volatile( + "1: \n" + "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24 + "subs %2, #8 \n" // 8 pixels per loop. + "vrev64.8 d0, d0 \n" + "vrev64.8 d1, d1 \n" + "vrev64.8 d2, d2 \n" + "vst3.8 {d0, d1, d2}, [%1]! \n" // dst += 24 + "bgt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"(-24) // %3 + : "cc", "memory", "d0", "d1", "d2"); +} + +void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_argb, int width) { asm volatile( "vmov.u8 d4, #255 \n" // Alpha @@ -2065,7 +2079,6 @@ void RAWToYRow_NEON(const uint8_t* src_raw, uint8_t* dst_y, int width) { : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"); } - void RGB24ToYJRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_yj, int width) { asm volatile( "vmov.u8 d4, #29 \n" // B * 0.1140 coefficient diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 3f64010a4..f9d56992c 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -278,8 +278,9 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y, v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels // RGB565. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 + // lines ahead + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -316,8 +317,9 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y, v20) "subs %w4, %w4, #8 \n" ARGBTOARGB1555 "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels // RGB565. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 + // lines ahead + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 @@ -530,8 +532,9 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y, v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels // RGB565. - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead - "b.gt 1b \n" + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 + // lines ahead + "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 @@ -750,13 +753,12 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { "tbl v1.16b, {v1.16b}, v3.16b \n" "tbl v0.16b, {v2.16b}, v3.16b \n" "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-32), // %3 - "r"(&kShuffleMirror) // %4 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-32), // %3 + "r"(&kShuffleMirror) // %4 : "cc", "memory", "v0", "v1", "v2", "v3"); } @@ -775,7 +777,6 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, "rev64 v1.8b, v1.8b \n" "st1 {v0.8b}, [%1], #8 \n" // dst += 8 "st1 {v1.8b}, [%2], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead "b.gt 1b \n" : "+r"(src_uv), // %0 "+r"(dst_u), // %1 @@ -785,24 +786,49 @@ void MirrorUVRow_NEON(const uint8_t* src_uv, : "cc", "memory", "v0", "v1"); } -void ARGBMirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { +void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, + int width) { asm volatile( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" - "sub %0, %0, #16 \n" + "ld1 {v4.16b}, [%4] \n" // shuffler + "add %0, %0, %w2, sxtw #2 \n" // Start at end of row. + "sub %0, %0, #64 \n" "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. - "rev64 v0.4s, v0.4s \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead + "ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], %3\n" // src -= 64 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v4.16b \n" + "tbl v1.16b, {v1.16b}, v4.16b \n" + "tbl v2.16b, {v2.16b}, v4.16b \n" + "tbl v3.16b, {v3.16b}, v4.16b \n" + "st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [%1], #64 \n" // dst += 64 "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + : "+r"(src_argb), // %0 + "+r"(dst_argb), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-64), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3", "v4"); +} + +void RGB24MirrorRow_NEON(const uint8_t* src_rgb24, uint8_t* dst_rgb24, + int width) { + src_rgb24 += width * 3 - 48; + asm volatile( + "ld1 {v3.16b}, [%4] \n" // shuffler + + "1: \n" + "ld3 {v0.16b, v1.16b, v2.16b}, [%0], %3\n" // src -= 48 + "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "tbl v0.16b, {v0.16b}, v3.16b \n" + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v2.16b, {v2.16b}, v3.16b \n" + "st3 {v0.16b, v1.16b, v2.16b}, [%1], #48 \n" // dst += 48 + "b.gt 1b \n" + : "+r"(src_rgb24), // %0 + "+r"(dst_rgb24), // %1 + "+r"(width) // %2 + : "r"((ptrdiff_t)-48), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3"); } void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24, @@ -2185,7 +2211,8 @@ void RAWToYJRow_NEON(const uint8_t* src_raw, uint8_t* dst_yj, int width) { "umull v0.8h, v0.8b, v4.8b \n" // B "umlal v0.8h, v1.8b, v5.8b \n" // G "umlal v0.8h, v2.8b, v6.8b \n" // R - "prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines ahead + "prfm pldl1keep, [%0, 448] \n" // prefetch 7 cache lines + // ahead "uqrshrn v0.8b, v0.8h, #8 \n" // 16 bit to 8 bit Y "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels Y. "prfm pldl1keep, [%0, 448] \n" // prefetch 7 lines ahead @@ -3077,21 +3104,19 @@ void GaussCol_F32_NEON(const float* src0, "subs %w6, %w6, #8 \n" // 8 processed per loop "st1 {v0.4s, v1.4s}, [%5], #32 \n" // store 8 samples "b.gt 1b \n" - : "+r"(src0), // %0 - "+r"(src1), // %1 - "+r"(src2), // %2 - "+r"(src3), // %3 - "+r"(src4), // %4 - "+r"(dst), // %5 - "+r"(width) // %6 - : "r"(&kGaussCoefficients) // %7 + : "+r"(src0), // %0 + "+r"(src1), // %1 + "+r"(src2), // %2 + "+r"(src3), // %3 + "+r"(src4), // %4 + "+r"(dst), // %5 + "+r"(width) // %6 + : "r"(&kGaussCoefficients) // %7 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } // filter 5 rows with 1, 4, 6, 4, 1 coefficients to produce 1 row. -void GaussRow_F32_NEON(const float* src, - float* dst, - int width) { +void GaussRow_F32_NEON(const float* src, float* dst, int width) { asm volatile( "ld3r {v6.4s, v7.4s, v8.4s}, [%3] \n" // constants 4, 6, 1/256 @@ -3114,13 +3139,13 @@ void GaussRow_F32_NEON(const float* src, "subs %w2, %w2, #8 \n" // 8 processed per loop "st1 {v0.4s, v1.4s}, [%1], #32 \n" // store 8 samples "b.gt 1b \n" - : "+r"(src), // %0 - "+r"(dst), // %1 - "+r"(width) // %2 - : "r"(&kGaussCoefficients), // %3 - "r"(8LL), // %4 - "r"(-4LL), // %5 - "r"(20LL) // %6 + : "+r"(src), // %0 + "+r"(dst), // %1 + "+r"(width) // %2 + : "r"(&kGaussCoefficients), // %3 + "r"(8LL), // %4 + "r"(-4LL), // %5 + "r"(20LL) // %6 : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8"); } diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index af1c94ad6..f66b2b849 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -1245,6 +1245,7 @@ TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1, 0) TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0) TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0) +TESTATOB(RGB24, 3, 3, 1, RGB24Mirror, 3, 3, 1, 0) TESTATOB(RAW, 3, 3, 1, J400, 1, 1, 1, 0) #ifdef INTEL_TEST TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index ac1e26ea3..1508bf250 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -500,53 +500,53 @@ static int I444TestFilter_16(int src_width, #define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ - int diff = I420TestFilter( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ - TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ - int diff = I444TestFilter( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ +#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ + int diff = I420TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ + TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter) { \ + int diff = I444TestFilter( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \ - int diff = I420TestFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ - } \ + int diff = I420TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ + } \ TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \ - int diff = I444TestFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ + int diff = I444TestFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. #ifdef ENABLE_SLOW_TESTS -#define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(, name, None, nom, denom, 0) \ - TEST_FACTOR1(, name, Linear, nom, denom, 3) \ - TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(, name, None, nom, denom, 0) \ + TEST_FACTOR1(, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ TEST_FACTOR1(, name, Box, nom, denom, boxdiff) #else -#define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ - TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ - TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ + TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff) #endif @@ -561,7 +561,7 @@ TEST_FACTOR(3, 1, 3, 0) #undef SX #undef DX -#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ +#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -574,13 +574,15 @@ TEST_FACTOR(3, 1, 3, 0) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, \ + DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ @@ -601,7 +603,7 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ @@ -609,7 +611,7 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ @@ -619,14 +621,14 @@ TEST_FACTOR(3, 1, 3, 0) #ifdef ENABLE_SLOW_TESTS // Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ +#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(, name, width, height, None, 0) \ TEST_SCALETO1(, name, width, height, Linear, 3) \ TEST_SCALETO1(, name, width, height, Bilinear, 3) \ TEST_SCALETO1(, name, width, height, Box, 3) #else - // Test scale to a specified size with all 4 filters. -#define TEST_SCALETO(name, width, height) \ +// Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ @@ -897,14 +899,14 @@ static int TestPlaneFilter_16(int src_width, #define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) {\ - int diff = TestPlaneFilter_16( \ - SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ - DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ - kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ - benchmark_cpu_info_); \ - EXPECT_LE(diff, max_diff); \ +#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ + TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) { \ + int diff = TestPlaneFilter_16( \ + SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ + DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ + kFilter##filter, benchmark_iterations_, disable_cpu_flags_, \ + benchmark_cpu_info_); \ + EXPECT_LE(diff, max_diff); \ } // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but