diff --git a/BUILD.gn b/BUILD.gn index 2196be415..8a6f96dba 100644 --- a/BUILD.gn +++ b/BUILD.gn @@ -73,7 +73,7 @@ group("libyuv") { deps += [ ":libyuv_mmi" ] } - if (!is_ios) { + if (!is_ios && !libyuv_disable_jpeg) { # Make sure that clients of libyuv link with libjpeg. This can't go in # libyuv_internal because in Windows x64 builds that will generate a clang # build of libjpeg, and we don't want two copies. @@ -150,7 +150,7 @@ static_library("libyuv_internal") { configs += [ "//build/config/gcc:symbol_visibility_default" ] } - if (!is_ios) { + if (!is_ios && !libyuv_disable_jpeg) { defines += [ "HAVE_JPEG" ] # Needed to pull in libjpeg headers. Can't add //third_party:jpeg to deps diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 1aa151b62..f7cb4f8fe 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -734,7 +734,7 @@ void MirrorPlane(const uint8_t* src_y, #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } diff --git a/source/rotate.cc b/source/rotate.cc index d414186a5..cea835d10 100644 --- a/source/rotate.cc +++ b/source/rotate.cc @@ -142,7 +142,7 @@ void RotatePlane180(const uint8_t* src, #if defined(HAS_MIRRORROW_NEON) if (TestCpuFlag(kCpuHasNEON)) { MirrorRow = MirrorRow_Any_NEON; - if (IS_ALIGNED(width, 16)) { + if (IS_ALIGNED(width, 32)) { MirrorRow = MirrorRow_NEON; } } @@ -207,11 +207,11 @@ void RotatePlane180(const uint8_t* src, // Odd height will harmlessly mirror the middle row twice. for (y = 0; y < half_height; ++y) { - MirrorRow(src, row, width); // Mirror first row into a buffer - src += src_stride; + CopyRow(src, row, width); // Copy first row into buffer MirrorRow(src_bot, dst, width); // Mirror last row into first row + MirrorRow(row, dst_bot, width); // Mirror buffer into last row + src += src_stride; dst += dst_stride; - CopyRow(row, dst_bot, width); // Copy first mirrored row into last src_bot -= src_stride; dst_bot -= dst_stride; } diff --git a/source/row_any.cc b/source/row_any.cc index 9b29b2bfb..3592ffb6c 100644 --- a/source/row_any.cc +++ b/source/row_any.cc @@ -1156,7 +1156,7 @@ ANY11M(MirrorRow_Any_AVX2, MirrorRow_AVX2, 1, 31) ANY11M(MirrorRow_Any_SSSE3, MirrorRow_SSSE3, 1, 15) #endif #ifdef HAS_MIRRORROW_NEON -ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 15) +ANY11M(MirrorRow_Any_NEON, MirrorRow_NEON, 1, 31) #endif #ifdef HAS_MIRRORROW_MSA ANY11M(MirrorRow_Any_MSA, MirrorRow_MSA, 1, 63) diff --git a/source/row_gcc.cc b/source/row_gcc.cc index 3088bb755..fa7b8cb31 100644 --- a/source/row_gcc.cc +++ b/source/row_gcc.cc @@ -84,7 +84,7 @@ static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u}; static const uvec16 kSub128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u, - 0x8080u, 0x8080u, 0x8080u, 0x8080u}; + 0x8080u, 0x8080u, 0x8080u, 0x8080u}; #endif // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3) @@ -1101,10 +1101,8 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "lea 0x40(%0),%0 \n" \ "phaddw %%xmm0,%%xmm6 \n" \ "phaddw %%xmm2,%%xmm1 \n" \ - "paddw %%" #round \ - ",%%xmm6 \n" \ - "paddw %%" #round \ - ",%%xmm1 \n" \ + "paddw %%" #round ",%%xmm6 \n" \ + "paddw %%" #round ",%%xmm1 \n" \ "psrlw $0x8,%%xmm6 \n" \ "psrlw $0x8,%%xmm1 \n" \ "packuswb %%xmm1,%%xmm6 \n" \ @@ -1113,35 +1111,33 @@ void ABGRToAR30Row_AVX2(const uint8_t* src, uint8_t* dst, int width) { "sub $0x10,%2 \n" \ "jg 1b \n" -#define RGBTOY_AVX2(round) \ - "1: \n" \ - "vmovdqu (%0),%%ymm0 \n" \ - "vmovdqu 0x20(%0),%%ymm1 \n" \ - "vmovdqu 0x40(%0),%%ymm2 \n" \ - "vmovdqu 0x60(%0),%%ymm3 \n" \ - "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ - "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ - "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ - "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ - "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ - "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ - "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ - "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ - "lea 0x80(%0),%0 \n" \ - "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ - "vpaddw %%" #round \ - ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ - "vpaddw %%" #round \ - ",%%ymm2,%%ymm2 \n" \ - "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ - "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ - "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ - "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ - "vmovdqu %%ymm0,(%1) \n" \ - "lea 0x20(%1),%1 \n" \ - "sub $0x20,%2 \n" \ - "jg 1b \n" \ +#define RGBTOY_AVX2(round) \ + "1: \n" \ + "vmovdqu (%0),%%ymm0 \n" \ + "vmovdqu 0x20(%0),%%ymm1 \n" \ + "vmovdqu 0x40(%0),%%ymm2 \n" \ + "vmovdqu 0x60(%0),%%ymm3 \n" \ + "vpsubb %%ymm5, %%ymm0, %%ymm0 \n" \ + "vpsubb %%ymm5, %%ymm1, %%ymm1 \n" \ + "vpsubb %%ymm5, %%ymm2, %%ymm2 \n" \ + "vpsubb %%ymm5, %%ymm3, %%ymm3 \n" \ + "vpmaddubsw %%ymm0,%%ymm4,%%ymm0 \n" \ + "vpmaddubsw %%ymm1,%%ymm4,%%ymm1 \n" \ + "vpmaddubsw %%ymm2,%%ymm4,%%ymm2 \n" \ + "vpmaddubsw %%ymm3,%%ymm4,%%ymm3 \n" \ + "lea 0x80(%0),%0 \n" \ + "vphaddw %%ymm1,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vphaddw %%ymm3,%%ymm2,%%ymm2 \n" \ + "vpaddw %%" #round ",%%ymm0,%%ymm0 \n" /* Add .5 for rounding. */ \ + "vpaddw %%" #round ",%%ymm2,%%ymm2 \n" \ + "vpsrlw $0x8,%%ymm0,%%ymm0 \n" \ + "vpsrlw $0x8,%%ymm2,%%ymm2 \n" \ + "vpackuswb %%ymm2,%%ymm0,%%ymm0 \n" /* mutates. */ \ + "vpermd %%ymm0,%%ymm6,%%ymm0 \n" /* unmutate. */ \ + "vmovdqu %%ymm0,(%1) \n" \ + "lea 0x20(%1),%1 \n" \ + "sub $0x20,%2 \n" \ + "jg 1b \n" \ "vzeroupper \n" #ifdef HAS_ARGBTOYROW_SSSE3 @@ -1152,15 +1148,15 @@ void ARGBToYRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN + RGBTOY(xmm7) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTOYROW_SSSE3 @@ -1172,7 +1168,8 @@ void ARGBToYJRow_SSSE3(const uint8_t* src_argb, uint8_t* dst_y, int width) { "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - LABELALIGN RGBTOY(xmm5) + LABELALIGN + RGBTOY(xmm5) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1190,7 +1187,8 @@ void RGBAToYJRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movdqa %3,%%xmm4 \n" "movdqa %4,%%xmm5 \n" - LABELALIGN RGBTOY(xmm5) + LABELALIGN + RGBTOY(xmm5) : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1212,7 +1210,8 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" - LABELALIGN RGBTOY_AVX2(ymm7) + LABELALIGN + RGBTOY_AVX2(ymm7) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1220,8 +1219,7 @@ void ARGBToYRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "m"(kSub128), // %4 "m"(kAddY16), // %5 "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTOYROW_AVX2 @@ -1234,7 +1232,8 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "vbroadcastf128 %5,%%ymm7 \n" "vmovdqu %6,%%ymm6 \n" - LABELALIGN RGBTOY_AVX2(ymm7) + LABELALIGN + RGBTOY_AVX2(ymm7) : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1242,8 +1241,7 @@ void ABGRToYRow_AVX2(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "m"(kSub128), // %4 "m"(kAddY16), // %5 "m"(kPermdARGBToY_AVX) // %6 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ABGRTOYROW_AVX2 @@ -1255,15 +1253,15 @@ void ARGBToYJRow_AVX2(const uint8_t* src_argb, uint8_t* dst_y, int width) { "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" - LABELALIGN RGBTOY_AVX2(ymm5) + LABELALIGN + RGBTOY_AVX2(ymm5) : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kARGBToYJ), // %3 "m"(kSub128), // %4 "m"(kPermdARGBToY_AVX) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } #endif // HAS_ARGBTOYJROW_AVX2 @@ -1275,8 +1273,9 @@ void RGBAToYJRow_AVX2(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "vbroadcastf128 %4,%%ymm5 \n" "vmovdqu %5,%%ymm6 \n" - LABELALIGN RGBTOY_AVX2( - ymm5) "vzeroupper \n" + LABELALIGN + RGBTOY_AVX2(ymm5) + "vzeroupper \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 @@ -1537,7 +1536,7 @@ void ARGBToUVJRow_AVX2(const uint8_t* src_argb0, "+r"(dst_v), // %2 "+rm"(width) // %3 : "r"((intptr_t)(src_stride_argb)), // %4 - "m"(kSub128), // %5 + "m"(kSub128), // %5 "m"(kARGBToVJ), // %6 "m"(kARGBToUJ), // %7 "m"(kShufARGBToUV_AVX) // %8 @@ -1607,7 +1606,7 @@ void ARGBToUVJRow_SSSE3(const uint8_t* src_argb0, : "r"((intptr_t)(src_stride_argb)), // %4 "m"(kARGBToVJ), // %5 "m"(kARGBToUJ), // %6 - "m"(kSub128) // %7 + "m"(kSub128) // %7 : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"); } #endif // HAS_ARGBTOUVJROW_SSSE3 @@ -1676,15 +1675,15 @@ void BGRAToYRow_SSSE3(const uint8_t* src_bgra, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN + RGBTOY(xmm7) : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kBGRAToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void BGRAToUVRow_SSSE3(const uint8_t* src_bgra0, @@ -1756,15 +1755,15 @@ void ABGRToYRow_SSSE3(const uint8_t* src_abgr, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN + RGBTOY(xmm7) : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kABGRToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { @@ -1773,15 +1772,15 @@ void RGBAToYRow_SSSE3(const uint8_t* src_rgba, uint8_t* dst_y, int width) { "movdqa %4,%%xmm5 \n" "movdqa %5,%%xmm7 \n" - LABELALIGN RGBTOY(xmm7) + LABELALIGN + RGBTOY(xmm7) : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 "+r"(width) // %2 : "m"(kRGBAToY), // %3 "m"(kSub128), // %4 "m"(kAddY16) // %5 - : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", - "xmm7"); + : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"); } void ABGRToUVRow_SSSE3(const uint8_t* src_abgr0, diff --git a/source/row_mmi.cc b/source/row_mmi.cc index 50cfca726..d7d34e47f 100644 --- a/source/row_mmi.cc +++ b/source/row_mmi.cc @@ -6040,90 +6040,93 @@ void I444ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; - __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension + uint64_t ub,ug,vg,vr,bb,bg,br,yg; + __asm__ volatile ( + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - "punpcklbh %[u], %[u], %[zero] \n\t" // u - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t"//u + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" // v - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t"//v + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // Also used for 420 @@ -6133,96 +6136,99 @@ void I422ToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" // yg - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" // bb - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" // ub - "or %[ub], %[ub], %[mask] \n\t" // must - // sign - // extension - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" // bg - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" // ug - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" // vg - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" // br - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" // vr - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" // sign - // extension + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t"//yg + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t"//bb + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t"//ub + "or %[ub], %[ub], %[mask] \n\t"//must sign extension + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t"//bg + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t"//ug + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t"//vg + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t"//br + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t"//vr + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t"//sign extension - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" // v - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" // u*ug - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" // v*vg - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t"//v + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t"//u*ug + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t"//v*vg + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" // v*vr - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t"//v*vr + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" // ffffgggg - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" // gbgbgbgb - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" // frfrfrfr - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" // frgbfrgb - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t"//ffffgggg + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t"//gbgbgbgb + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t"//frfrfrfr + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t"//frgbfrgb + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } // 10 bit YUV to ARGB @@ -6232,96 +6238,102 @@ void I210ToARGBRow_MMI(const uint16_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[y_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "psllh %[y], %[y], %[six] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psllh %[y], %[y], %[six] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklhw %[u], %[u], %[u] \n\t" - "psrah %[u], %[u], %[two] \n\t" - "punpcklhw %[v], %[v], %[v] \n\t" - "psrah %[v], %[v], %[two] \n\t" - "pminsh %[u], %[u], %[mask1] \n\t" - "pminsh %[v], %[v], %[mask1] \n\t" + "punpcklhw %[u], %[u], %[u] \n\t" + "psrah %[u], %[u], %[two] \n\t" + "punpcklhw %[v], %[v], %[v] \n\t" + "psrah %[v], %[v], %[two] \n\t" + "pminsh %[u], %[u], %[mask1] \n\t" + "pminsh %[v], %[v], %[mask1] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[alpha] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x08 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x04 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [alpha] "f"(-1), [six] "f"(0x6), - [five] "f"(0x55), [mask] "f"(mask), [two] "f"(0x02), - [mask1] "f"(0x00ff00ff00ff00ff) - : "memory"); + : [y]"=&f"(y), + [u]"=&f"(u), [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [alpha]"f"(-1), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask), [two]"f"(0x02), + [mask1]"f"(0x00ff00ff00ff00ff) + : "memory" + ); } void I422AlphaToARGBRow_MMI(const uint8_t* src_y, @@ -6331,96 +6343,102 @@ void I422AlphaToARGBRow_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v, a; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v,a; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" - "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "gslwlc1 %[a], 0x03(%[a_ptr]) \n\t" + "gslwrc1 %[a], 0x00(%[a_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" // rrrrbbbb - "packushb %[g_vec0], %[g_vec0], %[a] \n\t" - "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t" // aaaagggg - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t"//rrrrbbbb + "packushb %[g_vec0], %[g_vec0], %[a] \n\t" + "punpcklwd %[g_vec0], %[g_vec0], %[a] \n\t"//aaaagggg + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec1], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[a_ptr], %[a_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [a] "=&f"(a), - [b_vec0] "=&f"(b_vec[0]), [b_vec1] "=&f"(b_vec[1]), - [g_vec0] "=&f"(g_vec[0]), [g_vec1] "=&f"(g_vec[1]), - [r_vec0] "=&f"(r_vec[0]), [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), - [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), - [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [a_ptr] "r"(src_a), [zero] "f"(0x00), - [six] "f"(0x6), [five] "f"(0x55), [mask] "f"(mask) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), [a]"=&f"(a), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [a_ptr]"r"(src_a), [zero]"f"(0x00), + [six]"f"(0x6), [five]"f"(0x55), + [mask]"f"(mask) + : "memory" + ); } void I422ToRGB24Row_MMI(const uint8_t* src_y, @@ -6429,105 +6447,113 @@ void I422ToRGB24Row_MMI(const uint8_t* src_y, uint8_t* rgb_buf, const struct YuvConstants* yuvconstants, int width) { - uint64_t y, u, v; - uint64_t b_vec[2], g_vec[2], r_vec[2]; + uint64_t y,u,v; + uint64_t b_vec[2],g_vec[2],r_vec[2]; uint64_t mask = 0xff00ff00ff00ff00ULL; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec0], %[y], %[bb] \n\t" - "pmullh %[b_vec1], %[u], %[ub] \n\t" - "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" - "psrah %[b_vec0], %[b_vec0], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec0], %[y], %[bb] \n\t" + "pmullh %[b_vec1], %[u], %[ub] \n\t" + "psubsh %[b_vec0], %[b_vec0], %[b_vec1] \n\t" + "psrah %[b_vec0], %[b_vec0], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec0], %[y], %[bg] \n\t" - "pmullh %[g_vec1], %[u], %[ug] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "pmullh %[g_vec1], %[v], %[vg] \n\t" - "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" - "psrah %[g_vec0], %[g_vec0], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec0], %[y], %[bg] \n\t" + "pmullh %[g_vec1], %[u], %[ug] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "pmullh %[g_vec1], %[v], %[vg] \n\t" + "psubsh %[g_vec0], %[g_vec0], %[g_vec1] \n\t" + "psrah %[g_vec0], %[g_vec0], %[six] \n\t" - "paddsh %[r_vec0], %[y], %[br] \n\t" - "pmullh %[r_vec1], %[v], %[vr] \n\t" - "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" - "psrah %[r_vec0], %[r_vec0], %[six] \n\t" + "paddsh %[r_vec0], %[y], %[br] \n\t" + "pmullh %[r_vec1], %[v], %[vr] \n\t" + "psubsh %[r_vec0], %[r_vec0], %[r_vec1] \n\t" + "psrah %[r_vec0], %[r_vec0], %[six] \n\t" - "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" - "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" - "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" - "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" - "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" + "packushb %[r_vec0], %[b_vec0], %[r_vec0] \n\t" + "packushb %[g_vec0], %[g_vec0], %[zero] \n\t" + "punpcklbh %[b_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpckhbh %[r_vec0], %[r_vec0], %[g_vec0] \n\t" + "punpcklhw %[g_vec0], %[b_vec0], %[r_vec0] \n\t" + "punpckhhw %[g_vec1], %[b_vec0], %[r_vec0] \n\t" - "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" - "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" - "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" - "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" - "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" - "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" - "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" - "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" - "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" - "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" - "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec0], %[g_vec0], %[g_vec0] \n\t" + "psllw %[r_vec1], %[r_vec0], %[lmove1] \n\t" + "or %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "psrlw %[r_vec1], %[r_vec0], %[rmove1] \n\t" + "pextrh %[r_vec1], %[r_vec1], %[zero] \n\t" + "pinsrh_2 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[zero] \n\t" + "pinsrh_3 %[g_vec0], %[g_vec0], %[r_vec1] \n\t" + "pextrh %[r_vec1], %[g_vec1], %[one] \n\t" + "punpckhwd %[g_vec1], %[g_vec1], %[g_vec1] \n\t" + "psllw %[g_vec1], %[g_vec1], %[rmove1] \n\t" + "or %[g_vec1], %[g_vec1], %[r_vec1] \n\t" + "gssdlc1 %[g_vec0], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec0], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[g_vec1], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[g_vec1], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec0] "=&f"(b_vec[0]), - [b_vec1] "=&f"(b_vec[1]), [g_vec0] "=&f"(g_vec[0]), - [g_vec1] "=&f"(g_vec[1]), [r_vec0] "=&f"(r_vec[0]), - [r_vec1] "=&f"(r_vec[1]), [ub] "=&f"(ub), [ug] "=&f"(ug), - [vg] "=&f"(vg), [vr] "=&f"(vr), [bb] "=&f"(bb), [bg] "=&f"(bg), - [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(mask), [lmove1] "f"(0x18), [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0c \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" + + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec0]"=&f"(b_vec[0]), [b_vec1]"=&f"(b_vec[1]), + [g_vec0]"=&f"(g_vec[0]), [g_vec1]"=&f"(g_vec[1]), + [r_vec0]"=&f"(r_vec[0]), [r_vec1]"=&f"(r_vec[1]), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(mask), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void I422ToARGB4444Row_MMI(const uint8_t* src_y, @@ -6538,103 +6564,110 @@ void I422ToARGB4444Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" // y*0x0101 - "pmulhuh %[y], %[y], %[yg] \n\t" // y1 + "punpcklbh %[y], %[y], %[y] \n\t"//y*0x0101 + "pmulhuh %[y], %[y], %[yg] \n\t"//y1 - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" // u - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t"//u + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "and %[g_vec], %[g_vec], %[mask1] \n\t" - "psrlw %[g_vec], %[g_vec], %[four] \n\t" - "psrlw %[r_vec], %[g_vec], %[four] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[g_vec], %[g_vec], %[r_vec] \n\t" + "and %[g_vec], %[g_vec], %[mask1] \n\t" + "psrlw %[g_vec], %[g_vec], %[four] \n\t" + "psrlw %[r_vec], %[g_vec], %[four] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[g_vec], %[g_vec], %[r_vec] \n\t" - "and %[b_vec], %[b_vec], %[mask1] \n\t" - "psrlw %[b_vec], %[b_vec], %[four] \n\t" - "psrlw %[r_vec], %[b_vec], %[four] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" - "and %[b_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" + "and %[b_vec], %[b_vec], %[mask1] \n\t" + "psrlw %[b_vec], %[b_vec], %[four] \n\t" + "psrlw %[r_vec], %[b_vec], %[four] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "punpcklbh %[r_vec], %[alpha], %[zero] \n\t" + "and %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb4444]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb4444]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb4444], %[dst_argb4444], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb4444] "r"(dst_argb4444), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask] "f"(0xff00ff00ff00ff00), [four] "f"(0x4), - [mask1] "f"(0xf0f0f0f0f0f0f0f0), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb4444]"r"(dst_argb4444), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask]"f"(0xff00ff00ff00ff00), + [four]"f"(0x4), [mask1]"f"(0xf0f0f0f0f0f0f0f0), + [alpha]"f"(-1) + : "memory" + ); } void I422ToARGB1555Row_MMI(const uint8_t* src_y, @@ -6645,118 +6678,125 @@ void I422ToARGB1555Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "or %[g_vec], %[g_vec], %[mask3] \n\t" + "psrlw %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "or %[g_vec], %[g_vec], %[mask3] \n\t" - "psrlw %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlw %[temp], %[temp], %[eight] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "or %[b_vec], %[b_vec], %[mask3] \n\t" + "psrlw %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlw %[temp], %[temp], %[eight] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "or %[b_vec], %[b_vec], %[mask3] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_argb1555]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_argb1555]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_argb1555], %[dst_argb1555], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_argb1555] "r"(dst_argb1555), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), - [mask3] "f"(0x800000008000), [lmove5] "f"(0x5) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_argb1555]"r"(dst_argb1555), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [mask3]"f"(0x800000008000), + [lmove5]"f"(0x5) + : "memory" + ); } void I422ToRGB565Row_MMI(const uint8_t* src_y, @@ -6767,120 +6807,127 @@ void I422ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - // u3|u2|u1|u0 --> u1|u1|u0|u0 - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + //u3|u2|u1|u0 --> u1|u1|u0|u0 + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - // v3|v2|v1|v0 --> v1|v1|v0|v0 - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + //v3|v2|v1|v0 --> v1|v1|v0|v0 + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psllw %[r_vec], %[r_vec], %[lmove5] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [dst_rgb565] "r"(dst_rgb565), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7), - [lmove5] "f"(0x5) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7), + [lmove5]"f"(0x5) + : "memory" + ); } void NV12ToARGBRow_MMI(const uint8_t* src_y, @@ -6890,83 +6937,91 @@ void NV12ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV21ToARGBRow_MMI(const uint8_t* src_y, @@ -6976,83 +7031,91 @@ void NV21ToARGBRow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1) + : "memory" + ); } void NV12ToRGB24Row_MMI(const uint8_t* src_y, @@ -7062,95 +7125,103 @@ void NV12ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [lmove1] "f"(0x18), - [one] "f"(0x1), [rmove1] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [lmove1]"f"(0x18), + [one]"f"(0x1), [rmove1]"f"(0x8) + : "memory" + ); } void NV21ToRGB24Row_MMI(const uint8_t* src_y, @@ -7160,95 +7231,103 @@ void NV21ToRGB24Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[ushu] \n\t" - "pshufh %[u], %[u], %[vshu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[vu_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[vu_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[ushu] \n\t" + "pshufh %[u], %[u], %[vshu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" - "psllw %[temp], %[r_vec], %[lmove1] \n\t" - "or %[g_vec], %[g_vec], %[temp] \n\t" - "psrlw %[temp], %[r_vec], %[rmove1] \n\t" - "pextrh %[temp], %[temp], %[zero] \n\t" - "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[zero] \n\t" - "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" - "pextrh %[temp], %[b_vec], %[one] \n\t" - "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" - "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" - "or %[b_vec], %[b_vec], %[temp] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" - "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "punpckhwd %[r_vec], %[g_vec], %[g_vec] \n\t" + "psllw %[temp], %[r_vec], %[lmove1] \n\t" + "or %[g_vec], %[g_vec], %[temp] \n\t" + "psrlw %[temp], %[r_vec], %[rmove1] \n\t" + "pextrh %[temp], %[temp], %[zero] \n\t" + "pinsrh_2 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[zero] \n\t" + "pinsrh_3 %[g_vec], %[g_vec], %[temp] \n\t" + "pextrh %[temp], %[b_vec], %[one] \n\t" + "punpckhwd %[b_vec], %[b_vec], %[b_vec] \n\t" + "psllw %[b_vec], %[b_vec], %[rmove1] \n\t" + "or %[b_vec], %[b_vec], %[temp] \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gsswlc1 %[b_vec], 0x0b(%[rgbbuf_ptr]) \n\t" + "gsswrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[vu_ptr], %[vu_ptr], 0x04 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x0C \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [vu_ptr] "r"(src_vu), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [lmove1] "f"(0x18), - [rmove1] "f"(0x8), [one] "f"(0x1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [vu_ptr]"r"(src_vu), + [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [lmove1]"f"(0x18), [rmove1]"f"(0x8), + [one]"f"(0x1) + : "memory" + ); } void NV12ToRGB565Row_MMI(const uint8_t* src_y, @@ -7258,115 +7337,123 @@ void NV12ToRGB565Row_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "pshufh %[v], %[u], %[vshu] \n\t" - "pshufh %[u], %[u], %[ushu] \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[uv_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[uv_ptr]) \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "pshufh %[v], %[u], %[vshu] \n\t" + "pshufh %[u], %[u], %[ushu] \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "psrlh %[temp], %[g_vec], %[three] \n\t" - "and %[g_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "psrlh %[temp], %[g_vec], %[three] \n\t" + "and %[g_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[g_vec], %[g_vec], %[r_vec] \n\t" - "psrlh %[temp], %[b_vec], %[three] \n\t" - "and %[b_vec], %[temp], %[mask2] \n\t" - "psrlw %[temp], %[temp], %[seven] \n\t" - "psrlw %[r_vec], %[mask1], %[eight] \n\t" - "and %[r_vec], %[temp], %[r_vec] \n\t" - "psubb %[y], %[eight], %[three] \n\t" // 5 - "psllw %[r_vec], %[r_vec], %[y] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "paddb %[r_vec], %[three], %[six] \n\t" - "psrlw %[temp], %[temp], %[r_vec] \n\t" - "and %[r_vec], %[temp], %[mask2] \n\t" - "paddb %[temp], %[three], %[eight] \n\t" - "psllw %[r_vec], %[r_vec], %[temp] \n\t" - "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "psrlh %[temp], %[b_vec], %[three] \n\t" + "and %[b_vec], %[temp], %[mask2] \n\t" + "psrlw %[temp], %[temp], %[seven] \n\t" + "psrlw %[r_vec], %[mask1], %[eight] \n\t" + "and %[r_vec], %[temp], %[r_vec] \n\t" + "psubb %[y], %[eight], %[three] \n\t"//5 + "psllw %[r_vec], %[r_vec], %[y] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" + "paddb %[r_vec], %[three], %[six] \n\t" + "psrlw %[temp], %[temp], %[r_vec] \n\t" + "and %[r_vec], %[temp], %[mask2] \n\t" + "paddb %[temp], %[three], %[eight] \n\t" + "psllw %[r_vec], %[r_vec], %[temp] \n\t" + "or %[b_vec], %[b_vec], %[r_vec] \n\t" - "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" - "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" - "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" + "punpcklhw %[r_vec], %[g_vec], %[b_vec] \n\t" + "punpckhhw %[b_vec], %[g_vec], %[b_vec] \n\t" + "punpcklhw %[g_vec], %[r_vec], %[b_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[dst_rgb565]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[dst_rgb565]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" - "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[uv_ptr], %[uv_ptr], 0x04 \n\t" + "daddiu %[dst_rgb565], %[dst_rgb565], 0x08 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [uv_ptr] "r"(src_uv), [dst_rgb565] "r"(dst_rgb565), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [three] "f"(0x3), - [mask2] "f"(0x1f0000001f), [eight] "f"(0x8), [seven] "f"(0x7) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [uv_ptr]"r"(src_uv), + [dst_rgb565]"r"(dst_rgb565), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [three]"f"(0x3), [mask2]"f"(0x1f0000001f), + [eight]"f"(0x8), [seven]"f"(0x7) + : "memory" + ); } void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, @@ -7375,83 +7462,90 @@ void YUY2ToARGBRow_MMI(const uint8_t* src_yuy2, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" - "psrlh %[temp], %[y], %[eight] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[yuy2_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[yuy2_ptr]) \n\t" + "psrlh %[temp], %[y], %[eight] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[y], %[y], %[temp] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[y], %[y], %[temp] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[yuy2_ptr], %[yuy2_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [yuy2_ptr] "r"(src_yuy2), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [yuy2_ptr]"r"(src_yuy2), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, @@ -7460,83 +7554,90 @@ void UYVYToARGBRow_MMI(const uint8_t* src_uyvy, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" - "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" - "psrlh %[temp], %[mask1], %[eight] \n\t" - "and %[temp], %[y], %[temp] \n\t" - "pshufh %[u], %[temp], %[ushu] \n\t" - "pshufh %[v], %[temp], %[vshu] \n\t" + "1: \n\t" + "gsldlc1 %[y], 0x07(%[uyvy_ptr]) \n\t" + "gsldrc1 %[y], 0x00(%[uyvy_ptr]) \n\t" + "psrlh %[temp], %[mask1], %[eight] \n\t" + "and %[temp], %[y], %[temp] \n\t" + "pshufh %[u], %[temp], %[ushu] \n\t" + "pshufh %[v], %[temp], %[vshu] \n\t" - "psrlh %[y], %[y], %[eight] \n\t" - "psllh %[temp], %[y], %[eight] \n\t" - "or %[y], %[y], %[temp] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "psrlh %[y], %[y], %[eight] \n\t" + "psllh %[temp], %[y], %[eight] \n\t" + "or %[y], %[y], %[temp] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" - "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" - "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[g_vec], %[alpha] \n\t" + "punpcklbh %[b_vec], %[r_vec], %[g_vec] \n\t" + "punpckhbh %[r_vec], %[r_vec], %[g_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[uyvy_ptr], %[uyvy_ptr], 0x08 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [uyvy_ptr] "r"(src_uyvy), [rgbbuf_ptr] "r"(rgb_buf), - [yuvcons_ptr] "r"(yuvconstants), [width] "r"(width), [zero] "f"(0x00), - [five] "f"(0x55), [six] "f"(0x6), [mask1] "f"(0xff00ff00ff00ff00), - [ushu] "f"(0xA0), [vshu] "f"(0xf5), [alpha] "f"(-1), [eight] "f"(0x8) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [uyvy_ptr]"r"(src_uyvy), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [ushu]"f"(0xA0), [vshu]"f"(0xf5), + [alpha]"f"(-1), [eight]"f"(0x8) + : "memory" + ); } void I422ToRGBARow_MMI(const uint8_t* src_y, @@ -7547,104 +7648,112 @@ void I422ToRGBARow_MMI(const uint8_t* src_y, int width) { uint64_t y, u, v; uint64_t b_vec, g_vec, r_vec, temp; - uint64_t ub, ug, vg, vr, bb, bg, br, yg; + uint64_t ub,ug,vg,vr,bb,bg,br,yg; __asm__ volatile( - "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" - "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" - "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" - "or %[ub], %[ub], %[mask1] \n\t" - "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" - "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[ug], %[ug], %[zero] \n\t" - "pshufh %[ug], %[ug], %[zero] \n\t" - "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vg], %[vg], %[zero] \n\t" - "pshufh %[vg], %[vg], %[five] \n\t" - "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" - "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" - "punpcklbh %[vr], %[vr], %[zero] \n\t" - "pshufh %[vr], %[vr], %[five] \n\t" - "or %[vr], %[vr], %[mask1] \n\t" + "ldc1 %[yg], 0xc0(%[yuvcons_ptr]) \n\t" + "ldc1 %[bb], 0x60(%[yuvcons_ptr]) \n\t" + "ldc1 %[ub], 0x00(%[yuvcons_ptr]) \n\t" + "or %[ub], %[ub], %[mask1] \n\t" + "ldc1 %[bg], 0x80(%[yuvcons_ptr]) \n\t" + "ldc1 %[ug], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[ug], %[ug], %[zero] \n\t" + "pshufh %[ug], %[ug], %[zero] \n\t" + "ldc1 %[vg], 0x20(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vg], %[vg], %[zero] \n\t" + "pshufh %[vg], %[vg], %[five] \n\t" + "ldc1 %[br], 0xa0(%[yuvcons_ptr]) \n\t" + "ldc1 %[vr], 0x40(%[yuvcons_ptr]) \n\t" + "punpcklbh %[vr], %[vr], %[zero] \n\t" + "pshufh %[vr], %[vr], %[five] \n\t" + "or %[vr], %[vr], %[mask1] \n\t" - "1: \n\t" - "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" - "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" - "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" - "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" - "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" - "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" + "1: \n\t" + "gslwlc1 %[y], 0x03(%[y_ptr]) \n\t" + "gslwrc1 %[y], 0x00(%[y_ptr]) \n\t" + "gslwlc1 %[u], 0x03(%[u_ptr]) \n\t" + "gslwrc1 %[u], 0x00(%[u_ptr]) \n\t" + "gslwlc1 %[v], 0x03(%[v_ptr]) \n\t" + "gslwrc1 %[v], 0x00(%[v_ptr]) \n\t" - "punpcklbh %[y], %[y], %[y] \n\t" - "pmulhuh %[y], %[y], %[yg] \n\t" + "punpcklbh %[y], %[y], %[y] \n\t" + "pmulhuh %[y], %[y], %[yg] \n\t" - "punpcklbh %[u], %[u], %[u] \n\t" - "punpcklbh %[u], %[u], %[zero] \n\t" - "paddsh %[b_vec], %[y], %[bb] \n\t" - "pmullh %[temp], %[u], %[ub] \n\t" - "psubsh %[b_vec], %[b_vec], %[temp] \n\t" - "psrah %[b_vec], %[b_vec], %[six] \n\t" + "punpcklbh %[u], %[u], %[u] \n\t" + "punpcklbh %[u], %[u], %[zero] \n\t" + "paddsh %[b_vec], %[y], %[bb] \n\t" + "pmullh %[temp], %[u], %[ub] \n\t" + "psubsh %[b_vec], %[b_vec], %[temp] \n\t" + "psrah %[b_vec], %[b_vec], %[six] \n\t" - "punpcklbh %[v], %[v], %[v] \n\t" - "punpcklbh %[v], %[v], %[zero] \n\t" - "paddsh %[g_vec], %[y], %[bg] \n\t" - "pmullh %[temp], %[u], %[ug] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "pmullh %[temp], %[v], %[vg] \n\t" - "psubsh %[g_vec], %[g_vec], %[temp] \n\t" - "psrah %[g_vec], %[g_vec], %[six] \n\t" + "punpcklbh %[v], %[v], %[v] \n\t" + "punpcklbh %[v], %[v], %[zero] \n\t" + "paddsh %[g_vec], %[y], %[bg] \n\t" + "pmullh %[temp], %[u], %[ug] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "pmullh %[temp], %[v], %[vg] \n\t" + "psubsh %[g_vec], %[g_vec], %[temp] \n\t" + "psrah %[g_vec], %[g_vec], %[six] \n\t" - "paddsh %[r_vec], %[y], %[br] \n\t" - "pmullh %[temp], %[v], %[vr] \n\t" - "psubsh %[r_vec], %[r_vec], %[temp] \n\t" - "psrah %[r_vec], %[r_vec], %[six] \n\t" + "paddsh %[r_vec], %[y], %[br] \n\t" + "pmullh %[temp], %[v], %[vr] \n\t" + "psubsh %[r_vec], %[r_vec], %[temp] \n\t" + "psrah %[r_vec], %[r_vec], %[six] \n\t" - "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" - "packushb %[g_vec], %[g_vec], %[zero] \n\t" - "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" - "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" - "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" - "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" - "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[r_vec], %[b_vec], %[r_vec] \n\t" + "packushb %[g_vec], %[g_vec], %[zero] \n\t" + "punpcklwd %[g_vec], %[alpha], %[g_vec] \n\t" + "punpcklbh %[b_vec], %[g_vec], %[r_vec] \n\t" + "punpckhbh %[r_vec], %[g_vec], %[r_vec] \n\t" + "punpcklhw %[g_vec], %[b_vec], %[r_vec] \n\t" + "punpckhhw %[b_vec], %[b_vec], %[r_vec] \n\t" - "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" - "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" - "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[g_vec], 0x07(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[g_vec], 0x00(%[rgbbuf_ptr]) \n\t" + "gssdlc1 %[b_vec], 0x0f(%[rgbbuf_ptr]) \n\t" + "gssdrc1 %[b_vec], 0x08(%[rgbbuf_ptr]) \n\t" - "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" - "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" - "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" - "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "bnez %[width], 1b \n\t" + "daddiu %[y_ptr], %[y_ptr], 0x04 \n\t" + "daddiu %[u_ptr], %[u_ptr], 0x02 \n\t" + "daddiu %[v_ptr], %[v_ptr], 0x02 \n\t" + "daddiu %[rgbbuf_ptr], %[rgbbuf_ptr], 0x10 \n\t" + "daddi %[width], %[width], -0x04 \n\t" + "bnez %[width], 1b \n\t" - : [y] "=&f"(y), [u] "=&f"(u), [v] "=&f"(v), [b_vec] "=&f"(b_vec), - [g_vec] "=&f"(g_vec), [r_vec] "=&f"(r_vec), [temp] "=&f"(temp), - [ub] "=&f"(ub), [ug] "=&f"(ug), [vg] "=&f"(vg), [vr] "=&f"(vr), - [bb] "=&f"(bb), [bg] "=&f"(bg), [br] "=&f"(br), [yg] "=&f"(yg) - : [y_ptr] "r"(src_y), [u_ptr] "r"(src_u), [v_ptr] "r"(src_v), - [rgbbuf_ptr] "r"(rgb_buf), [yuvcons_ptr] "r"(yuvconstants), - [width] "r"(width), [zero] "f"(0x00), [five] "f"(0x55), [six] "f"(0x6), - [mask1] "f"(0xff00ff00ff00ff00), [alpha] "f"(-1) - : "memory"); + : [y]"=&f"(y), [u]"=&f"(u), + [v]"=&f"(v), + [b_vec]"=&f"(b_vec), [g_vec]"=&f"(g_vec), + [r_vec]"=&f"(r_vec), [temp]"=&f"(temp), + [ub]"=&f"(ub), [ug]"=&f"(ug), + [vg]"=&f"(vg), [vr]"=&f"(vr), + [bb]"=&f"(bb), [bg]"=&f"(bg), + [br]"=&f"(br), [yg]"=&f"(yg) + : [y_ptr]"r"(src_y), [u_ptr]"r"(src_u), + [v_ptr]"r"(src_v), [rgbbuf_ptr]"r"(rgb_buf), + [yuvcons_ptr]"r"(yuvconstants), [width]"r"(width), + [zero]"f"(0x00), [five]"f"(0x55), + [six]"f"(0x6), [mask1]"f"(0xff00ff00ff00ff00), + [alpha]"f"(-1) + : "memory" + ); } void ARGBSetRow_MMI(uint8_t* dst_argb, uint32_t v32, int width) { - __asm__ volatile( - "punpcklwd %[v32], %[v32], %[v32] \n\t" - "1: \n\t" - "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" - "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" - "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" + __asm__ volatile ( + "punpcklwd %[v32], %[v32], %[v32] \n\t" + "1: \n\t" + "gssdlc1 %[v32], 0x07(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x00(%[dst_ptr]) \n\t" + "gssdlc1 %[v32], 0x0f(%[dst_ptr]) \n\t" + "gssdrc1 %[v32], 0x08(%[dst_ptr]) \n\t" - "daddi %[width], %[width], -0x04 \n\t" - "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" - "bnez %[width], 1b \n\t" - : [v32] "+&f"(v32) - : [dst_ptr] "r"(dst_argb), [width] "r"(width) - : "memory"); + "daddi %[width], %[width], -0x04 \n\t" + "daddiu %[dst_ptr], %[dst_ptr], 0x10 \n\t" + "bnez %[width], 1b \n\t" + : [v32]"+&f"(v32) + : [dst_ptr]"r"(dst_argb), [width]"r"(width) + : "memory" + ); } // 10 bit YUV to ARGB diff --git a/source/row_neon.cc b/source/row_neon.cc index 1cf8eefea..99e7db97f 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -682,22 +682,23 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. - "mov r3, #-16 \n" "add %0, %0, %2 \n" - "sub %0, #16 \n" + "sub %0, %0, #32 \n" // 32 bytes per loop "1: \n" - "vld1.8 {q0}, [%0], r3 \n" // src -= 16 - "subs %2, #16 \n" // 16 pixels per loop. - "vrev64.8 q0, q0 \n" - "vst1.8 {d1}, [%1]! \n" // dst += 16 - "vst1.8 {d0}, [%1]! \n" + "vld1.8 {q1, q2}, [%0], %3 \n" // src -= 32 + "subs %2, #32 \n" // 32 pixels per loop. + "vrev64.8 q0, q2 \n" + "vrev64.8 q1, q1 \n" + "vswp d0, d1 \n" + "vswp d2, d3 \n" + "vst1.8 {q0, q1}, [%1]! \n" // dst += 32 "bgt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : - : "cc", "memory", "r3", "q0"); + : "r"(-32) // %3 + : "cc", "memory", "q0", "q1", "q2"); } void MirrorUVRow_NEON(const uint8_t* src_uv, diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 866e7bfc6..5646da8a2 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -723,23 +723,29 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) { : "cc", "memory", "v0"); } +// Shuffle table for reversing the bytes. +static const uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, + 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u}; + void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) { asm volatile( // Start at end of source row. + "ld1 {v3.16b}, [%4] \n" // shuffler "add %0, %0, %w2, sxtw \n" - "sub %0, %0, #16 \n" + "sub %0, %0, #32 \n" "1: \n" - "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. - "rev64 v0.16b, v0.16b \n" - "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 - "st1 {v0.D}[0], [%1], #8 \n" + "ld1 {v1.16b,v2.16b}, [%0], %3 \n" // src -= 32 + "subs %w2, %w2, #32 \n" // 32 pixels per loop. + "tbl v1.16b, {v1.16b}, v3.16b \n" + "tbl v0.16b, {v2.16b}, v3.16b \n" + "st1 {v0.16b, v1.16b}, [%1], #32 \n" // store 32 pixels "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 "+r"(width) // %2 - : "r"((ptrdiff_t)-16) // %3 - : "cc", "memory", "v0"); + : "r"((ptrdiff_t)-32), // %3 + "r"(&kShuffleMirror) // %4 + : "cc", "memory", "v0", "v1", "v2", "v3"); } void MirrorUVRow_NEON(const uint8_t* src_uv, diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index fb9632a94..698cd9562 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -40,9 +40,9 @@ #endif // Some functions fail on big endian. Enable these tests on all cpus except -// PowerPC -#if !defined(__powerpc__) -#define LITTLE_ENDIAN_TEST 1 +// PowerPC, but they are not optimized so disabled by default. +#if !defined(__powerpc__) && defined(ENABLE_SLOW_TESTS) +#define INTEL_TEST 1 #endif namespace libyuv { @@ -691,7 +691,7 @@ TESTPLANARTOB(J420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(J420, 2, 2, RGB24, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RAW, 3, 3, 1) TESTPLANARTOB(H420, 2, 2, RGB24, 3, 3, 1) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANARTOB(I420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(J420, 2, 2, RGB565, 2, 2, 1) TESTPLANARTOB(H420, 2, 2, RGB565, 2, 2, 1) @@ -723,7 +723,7 @@ TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1) TESTPLANARTOB(I422, 2, 1, UYVY, 2, 4, 1) TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 1) TESTPLANARTOB(J420, 2, 2, J400, 1, 1, 1) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANARTOB(I420, 2, 2, AR30, 4, 4, 1) TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) #endif @@ -876,7 +876,7 @@ TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2) TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2) TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2) TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9) #endif TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) @@ -1012,7 +1012,7 @@ TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) #endif @@ -1022,7 +1022,7 @@ TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) #endif TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) @@ -1200,20 +1200,20 @@ TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2) // TODO(fbarchard): make ARM version of C code that matches NEON. TESTATOB(AB30, 4, 4, 1, ABGR, 4, 4, 1, 0) TESTATOB(AB30, 4, 4, 1, ARGB, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(ABGR, 4, 4, 1, AR30, 4, 4, 1, 0) #endif TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(AR30, 4, 4, 1, AB30, 4, 4, 1, 0) #endif TESTATOB(AR30, 4, 4, 1, ABGR, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(AR30, 4, 4, 1, AR30, 4, 4, 1, 0) TESTATOB(AR30, 4, 4, 1, ARGB, 4, 4, 1, 0) #endif TESTATOB(ARGB, 4, 4, 1, ABGR, 4, 4, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(ARGB, 4, 4, 1, AR30, 4, 4, 1, 0) #endif TESTATOB(ARGB, 4, 4, 1, ARGB, 4, 4, 1, 0) @@ -1226,7 +1226,7 @@ TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 0) TESTATOB(RGBA, 4, 4, 1, J400, 1, 1, 1, 0) TESTATOB(ARGB, 4, 4, 1, RAW, 3, 3, 1, 0) TESTATOB(ARGB, 4, 4, 1, RGB24, 3, 3, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) #endif TESTATOB(ARGB, 4, 4, 1, RGBA, 4, 4, 1, 0) @@ -1245,7 +1245,7 @@ TESTATOB(RAW, 3, 3, 1, RGBA, 4, 4, 1, 0) TESTATOB(RAW, 3, 3, 1, RGB24, 3, 3, 1, 0) TESTATOB(RGB24, 3, 3, 1, ARGB, 4, 4, 1, 0) TESTATOB(RGB24, 3, 3, 1, J400, 1, 1, 1, 0) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOB(RGB565, 2, 2, 1, ARGB, 4, 4, 1, 0) #endif TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) @@ -1348,7 +1348,7 @@ TESTATOB(YUY2, 2, 4, 1, Y, 1, 1, 1, 0) TESTATOBDRANDOM(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B, \ HEIGHT_B, DIFF) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTATOBD(ARGB, 4, 4, 1, RGB565, 2, 2, 1, 0) #endif @@ -2447,7 +2447,7 @@ TEST_F(LibYUVConvertTest, TestDither) { TESTPLANARTOBID(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ YALIGN, benchmark_width_, DIFF, _Opt, +, 0, FMT_C, BPP_C) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANARTOBD(I420, 2, 2, RGB565, 2, 2, 1, 9, ARGB, 4) #endif #define TESTPTOB(NAME, UYVYTOI420, UYVYTONV12) \ @@ -2591,7 +2591,7 @@ TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, RGB24, 3) TESTPLANARTOE(H420, 2, 2, RGB24, 1, 3, RAW, 3) TESTPLANARTOE(H420, 2, 2, ARGB, 1, 4, RAW, 3) TESTPLANARTOE(H420, 2, 2, RAW, 1, 3, ARGB, 4) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, RGB565, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB1555, 2) TESTPLANARTOE(I420, 2, 2, ARGB, 1, 4, ARGB4444, 2) @@ -2738,7 +2738,7 @@ TESTQPLANARTOE(I420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4) _Opt, +, 0, FMT_C, BPP_C) // Caveat: Destination needs to be 4 bytes -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4) TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4) TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4) @@ -2929,7 +2929,7 @@ TESTPLANAR16TOB(H210, 2, 1, ARGB, 4, 4, 1, 2) TESTPLANAR16TOB(H210, 2, 1, ABGR, 4, 4, 1, 2) TESTPLANAR16TOB(U210, 2, 1, ARGB, 4, 4, 1, 2) TESTPLANAR16TOB(U210, 2, 1, ABGR, 4, 4, 1, 2) -#ifdef LITTLE_ENDIAN_TEST +#ifdef INTEL_TEST TESTPLANAR16TOB(I010, 2, 2, AR30, 4, 4, 1, 2) TESTPLANAR16TOB(I010, 2, 2, AB30, 4, 4, 1, 2) TESTPLANAR16TOB(H010, 2, 2, AR30, 4, 4, 1, 2) diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc index 93b77f56a..1dfa47f86 100644 --- a/unit_test/scale_argb_test.cc +++ b/unit_test/scale_argb_test.cc @@ -306,7 +306,9 @@ TEST_SCALETO(ARGBScale, 320, 240) TEST_SCALETO(ARGBScale, 569, 480) TEST_SCALETO(ARGBScale, 640, 360) TEST_SCALETO(ARGBScale, 1280, 720) +#ifdef ENABLE_SLOW_TESTS TEST_SCALETO(ARGBScale, 1920, 1080) +#endif // ENABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc index 94a785900..ac1e26ea3 100644 --- a/unit_test/scale_test.cc +++ b/unit_test/scale_test.cc @@ -500,7 +500,7 @@ static int I444TestFilter_16(int src_width, #define DX(x, nom, denom) static_cast(((Abs(x) / nom + 1) / 2) * nom * 2) #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) -#define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ +#define TEST_FACTOR1(DISABLED_, name, filter, nom, denom, max_diff) \ TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter) { \ int diff = I420TestFilter( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ @@ -517,7 +517,7 @@ static int I444TestFilter_16(int src_width, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I420ScaleDownBy##name##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, DISABLED_##I420ScaleDownBy##name##_##filter##_16) { \ int diff = I420TestFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ @@ -525,7 +525,7 @@ static int I444TestFilter_16(int src_width, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I444ScaleDownBy##name##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, DISABLED_##I444ScaleDownBy##name##_##filter##_16) { \ int diff = I444TestFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \ @@ -536,11 +536,19 @@ static int I444TestFilter_16(int src_width, // Test a scale factor with all 4 filters. Expect unfiltered to be exact, but // filtering is different fixed point implementations for SSSE3, Neon and C. +#ifdef ENABLE_SLOW_TESTS #define TEST_FACTOR(name, nom, denom, boxdiff) \ - TEST_FACTOR1(name, None, nom, denom, 0) \ - TEST_FACTOR1(name, Linear, nom, denom, 3) \ - TEST_FACTOR1(name, Bilinear, nom, denom, 3) \ - TEST_FACTOR1(name, Box, nom, denom, boxdiff) + TEST_FACTOR1(, name, None, nom, denom, 0) \ + TEST_FACTOR1(, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(, name, Box, nom, denom, boxdiff) +#else +#define TEST_FACTOR(name, nom, denom, boxdiff) \ + TEST_FACTOR1(DISABLED_, name, None, nom, denom, 0) \ + TEST_FACTOR1(DISABLED_, name, Linear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Bilinear, nom, denom, 3) \ + TEST_FACTOR1(DISABLED_, name, Box, nom, denom, boxdiff) +#endif TEST_FACTOR(2, 1, 2, 0) TEST_FACTOR(4, 1, 4, 0) @@ -553,7 +561,7 @@ TEST_FACTOR(3, 1, 3, 0) #undef SX #undef DX -#define TEST_SCALETO1(name, width, height, filter, max_diff) \ +#define TEST_SCALETO1(DISABLED_, name, width, height, filter, max_diff) \ TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter) { \ int diff = I420TestFilter(benchmark_width_, benchmark_height_, width, \ height, kFilter##filter, benchmark_iterations_, \ @@ -566,13 +574,13 @@ TEST_FACTOR(3, 1, 3, 0) disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I420##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, DISABLED_##I420##name##To##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ EXPECT_LE(diff, max_diff); \ } \ - TEST_F(LibYUVScaleTest, I444##name##To##width##x##height##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, DISABLED_##I444##name##To##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16( \ benchmark_width_, benchmark_height_, width, height, kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, benchmark_cpu_info_); \ @@ -593,7 +601,7 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - I420##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I420##name##From##width##x##height##_##filter##_16) { \ int diff = I420TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ @@ -601,7 +609,7 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } \ TEST_F(LibYUVScaleTest, \ - I444##name##From##width##x##height##_##filter##_16) { \ + DISABLED_##I444##name##From##width##x##height##_##filter##_16) { \ int diff = I444TestFilter_16(width, height, Abs(benchmark_width_), \ Abs(benchmark_height_), kFilter##filter, \ benchmark_iterations_, disable_cpu_flags_, \ @@ -609,19 +617,30 @@ TEST_FACTOR(3, 1, 3, 0) EXPECT_LE(diff, max_diff); \ } +#ifdef ENABLE_SLOW_TESTS // Test scale to a specified size with all 4 filters. #define TEST_SCALETO(name, width, height) \ - TEST_SCALETO1(name, width, height, None, 0) \ - TEST_SCALETO1(name, width, height, Linear, 3) \ - TEST_SCALETO1(name, width, height, Bilinear, 3) \ - TEST_SCALETO1(name, width, height, Box, 3) + TEST_SCALETO1(, name, width, height, None, 0) \ + TEST_SCALETO1(, name, width, height, Linear, 3) \ + TEST_SCALETO1(, name, width, height, Bilinear, 3) \ + TEST_SCALETO1(, name, width, height, Box, 3) +#else + // Test scale to a specified size with all 4 filters. +#define TEST_SCALETO(name, width, height) \ + TEST_SCALETO1(DISABLED_, name, width, height, None, 0) \ + TEST_SCALETO1(DISABLED_, name, width, height, Linear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Bilinear, 3) \ + TEST_SCALETO1(DISABLED_, name, width, height, Box, 3) +#endif TEST_SCALETO(Scale, 1, 1) TEST_SCALETO(Scale, 320, 240) TEST_SCALETO(Scale, 569, 480) TEST_SCALETO(Scale, 640, 360) TEST_SCALETO(Scale, 1280, 720) +#ifdef ENABLE_SLOW_TESTS TEST_SCALETO(Scale, 1920, 1080) +#endif // ENABLE_SLOW_TESTS #undef TEST_SCALETO1 #undef TEST_SCALETO @@ -879,7 +898,7 @@ static int TestPlaneFilter_16(int src_width, #define SX(x, nom, denom) static_cast(((x / nom + 1) / 2) * denom * 2) #define TEST_FACTOR1(name, filter, nom, denom, max_diff) \ - TEST_F(LibYUVScaleTest, ScalePlaneDownBy##name##_##filter##_16) { \ + TEST_F(LibYUVScaleTest, DISABLED_##ScalePlaneDownBy##name##_##filter##_16) {\ int diff = TestPlaneFilter_16( \ SX(benchmark_width_, nom, denom), SX(benchmark_height_, nom, denom), \ DX(benchmark_width_, nom, denom), DX(benchmark_height_, nom, denom), \