From 7018f5be0f98419c0e4eac518b0641655ba91c98 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Mon, 3 Oct 2016 18:21:31 -0700
Subject: [PATCH] Add MSA optimized I422ToYUY2Row, I422ToUYVYRow functions

R=fbarchard@google.com
BUG=libyuv:634

Performance gains :-

I422ToYUY2Row_MSA     - ~12x
I422ToYUY2Row_Any_MSA - ~7x

I422ToUYVYRow_MSA     - ~12x
I422ToUYVYRow_Any_MSA - ~7x

Review URL: https://codereview.chromium.org/2378753004 .
---
 include/libyuv/macros_msa.h | 13 ++++++++++
 include/libyuv/row.h        | 18 ++++++++++++++
 source/convert_from.cc      | 24 +++++++++++++++++++
 source/convert_from_argb.cc | 16 +++++++++++++
 source/row_any.cc           |  6 +++++
 source/row_msa.cc           | 48 +++++++++++++++++++++++++++++++++++++
 6 files changed, 125 insertions(+)

diff --git a/include/libyuv/macros_msa.h b/include/libyuv/macros_msa.h
index 92ed21c38..8a81e8213 100644
--- a/include/libyuv/macros_msa.h
+++ b/include/libyuv/macros_msa.h
@@ -71,6 +71,19 @@
 }
 #define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
 
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'
+*/
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {           \
+  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
+  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
+}
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+
 #endif  /* !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa) */
 
 #endif  // INCLUDE_LIBYUV_MACROS_MSA_H_
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 7bbad513c..01cca3358 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -370,6 +370,8 @@ extern "C" {
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
 #define HAS_MIRRORROW_MSA
 #define HAS_ARGBMIRRORROW_MSA
+#define HAS_I422TOYUY2ROW_MSA
+#define HAS_I422TOUYVYROW_MSA
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -1769,6 +1771,22 @@ void I422ToUYVYRow_Any_NEON(const uint8* src_y,
                             const uint8* src_u,
                             const uint8* src_v,
                             uint8* dst_uyvy, int width);
+void I422ToYUY2Row_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_yuy2, int width);
+void I422ToUYVYRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_uyvy, int width);
+void I422ToYUY2Row_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_yuy2, int width);
+void I422ToUYVYRow_Any_MSA(const uint8* src_y,
+                           const uint8* src_u,
+                           const uint8* src_v,
+                           uint8* dst_uyvy, int width);
 
 // Effects related row functions.
 void ARGBAttenuateRow_C(const uint8* src_argb, uint8* dst_argb, int width);
diff --git a/source/convert_from.cc b/source/convert_from.cc
index 3b2dca816..1256ca99c 100644
--- a/source/convert_from.cc
+++ b/source/convert_from.cc
@@ -237,6 +237,14 @@ int I420ToYUY2(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
@@ -298,6 +306,14 @@ int I422ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
@@ -345,6 +361,14 @@ int I420ToUYVY(const uint8* src_y, int src_stride_y,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc
index 2a8682b7e..50ede22a6 100644
--- a/source/convert_from_argb.cc
+++ b/source/convert_from_argb.cc
@@ -553,6 +553,14 @@ int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOYUY2ROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToYUY2Row = I422ToYUY2Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToYUY2Row = I422ToYUY2Row_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
@@ -655,6 +663,14 @@ int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
     }
   }
 #endif
+#if defined(HAS_I422TOUYVYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    I422ToUYVYRow = I422ToUYVYRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      I422ToUYVYRow = I422ToUYVYRow_MSA;
+    }
+  }
+#endif
 
   {
     // Allocate a rows of yuv.
diff --git a/source/row_any.cc b/source/row_any.cc
index 1f1b36348..df442b0f6 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -80,9 +80,15 @@ ANY31(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, 1, 1, 4, 15)
 #ifdef HAS_I422TOYUY2ROW_NEON
 ANY31(I422ToYUY2Row_Any_NEON, I422ToYUY2Row_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOYUY2ROW_MSA
+ANY31(I422ToYUY2Row_Any_MSA, I422ToYUY2Row_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_I422TOUYVYROW_NEON
 ANY31(I422ToUYVYRow_Any_NEON, I422ToUYVYRow_NEON, 1, 1, 4, 15)
 #endif
+#ifdef HAS_I422TOUYVYROW_MSA
+ANY31(I422ToUYVYRow_Any_MSA, I422ToUYVYRow_MSA, 1, 1, 4, 31)
+#endif
 #ifdef HAS_BLENDPLANEROW_AVX2
 ANY31(BlendPlaneRow_Any_AVX2, BlendPlaneRow_AVX2, 0, 0, 1, 31)
 #endif
diff --git a/source/row_msa.cc b/source/row_msa.cc
index b86865cf3..52a246cdb 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -53,6 +53,54 @@ void ARGBMirrorRow_MSA(const uint8* src, uint8* dst, int width) {
   }
 }
 
+void I422ToYUY2Row_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_yuy2,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(vec_uv0, src_y0, dst_yuy2_0, dst_yuy2_1);
+    ILVRL_B2_UB(vec_uv1, src_y1, dst_yuy2_2, dst_yuy2_3);
+    ST_UB4(dst_yuy2_0, dst_yuy2_1, dst_yuy2_2, dst_yuy2_3, dst_yuy2, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_yuy2 += 64;
+  }
+}
+
+void I422ToUYVYRow_MSA(const uint8* src_y,
+                       const uint8* src_u,
+                       const uint8* src_v,
+                       uint8* dst_uyvy,
+                       int width) {
+  int x;
+  v16u8 src_u0, src_v0, src_y0, src_y1, vec_uv0, vec_uv1;
+  v16u8 dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3;
+
+  for (x = 0; x < width; x += 32) {
+    src_u0 = LD_UB(src_u);
+    src_v0 = LD_UB(src_v);
+    LD_UB2(src_y, 16, src_y0, src_y1);
+    ILVRL_B2_UB(src_v0, src_u0, vec_uv0, vec_uv1);
+    ILVRL_B2_UB(src_y0, vec_uv0, dst_uyvy0, dst_uyvy1);
+    ILVRL_B2_UB(src_y1, vec_uv1, dst_uyvy2, dst_uyvy3);
+    ST_UB4(dst_uyvy0, dst_uyvy1, dst_uyvy2, dst_uyvy3, dst_uyvy, 16);
+    src_u += 16;
+    src_v += 16;
+    src_y += 32;
+    dst_uyvy += 64;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv