From a2891ec77c183ec265af8278eee821e4d9715c12 Mon Sep 17 00:00:00 2001
From: Frank Barchard <fbarchard@google.com>
Date: Fri, 7 Oct 2016 10:37:22 -0700
Subject: [PATCH] Add MSA optimized YUY2ToI422, YUY2ToI420, UYVYToI422,
 UYVYToI420 functions

R=fbarchard@google.com
BUG=libyuv:634

Performance gains as below,

YUY2ToI422, YUY2ToI420 :-

YUY2ToYRow_MSA          : ~10x
YUY2ToUVRow_MSA         : ~11x
YUY2ToUV422Row_MSA      : ~9x
YUY2ToYRow_Any_MSA      : ~6x
YUY2ToUVRow_Any_MSA     : ~5x
YUY2ToUV422Row_Any_MSA  : ~4x

UYVYToI422, UYVYToI420 :-

UYVYToYRow_MSA          : ~10x
UYVYToUVRow_MSA         : ~11x
UYVYToUV422Row_MSA      : ~9x
UYVYToYRow_Any_MSA      : ~6x
UYVYToUVRow_Any_MSA     : ~5x
UYVYToUV422Row_Any_MSA  : ~4x

Review URL: https://codereview.chromium.org/2397693002 .
---
 docs/getting_started.md    |   6 +-
 include/libyuv/row.h       |  26 ++++++++
 source/convert.cc          |  20 +++++++
 source/planar_functions.cc |  20 +++++++
 source/row_any.cc          |  16 +++++
 source/row_msa.cc          | 120 +++++++++++++++++++++++++++++++++++++
 6 files changed, 205 insertions(+), 3 deletions(-)

diff --git a/docs/getting_started.md b/docs/getting_started.md
index 9524d9d6c..a0d50a94e 100644
--- a/docs/getting_started.md
+++ b/docs/getting_started.md
@@ -166,12 +166,12 @@ ia32
 mipsel
 
     gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
-    gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+    gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mipsel\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
     ninja -j7 -v -C out/Debug libyuv_unittest
     ninja -j7 -v -C out/Release libyuv_unittest
 
-    gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
-    gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\"" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+    gn gen out/Release "--args=is_debug=false target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
+    gn gen out/Debug "--args=is_debug=true target_os=\"android\" target_cpu=\"mips64el\" mips_arch_variant=\"r6\" mips_use_msa=true is_component_build=true is_clang=false"
     ninja -j7 -v -C out/Debug libyuv_unittest
     ninja -j7 -v -C out/Release libyuv_unittest
 
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 01cca3358..1a17cbf4e 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -372,6 +372,12 @@ extern "C" {
 #define HAS_ARGBMIRRORROW_MSA
 #define HAS_I422TOYUY2ROW_MSA
 #define HAS_I422TOUYVYROW_MSA
+#define HAS_YUY2TOYROW_MSA
+#define HAS_YUY2TOUVROW_MSA
+#define HAS_YUY2TOUV422ROW_MSA
+#define HAS_UYVYTOYROW_MSA
+#define HAS_UYVYTOUVROW_MSA
+
 #endif
 
 #if defined(_MSC_VER) && !defined(__CLR_VER) && !defined(__clang__)
@@ -1669,6 +1675,11 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
                       uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_NEON(const uint8* src_yuy2,
                          uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_MSA(const uint8* src_yuy2, int stride_yuy2,
+                     uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2,
+                        uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToYRow_C(const uint8* src_yuy2, uint8* dst_y, int width);
 void YUY2ToUVRow_C(const uint8* src_yuy2, int stride_yuy2,
                    uint8* dst_u, uint8* dst_v, int width);
@@ -1689,6 +1700,11 @@ void YUY2ToUVRow_Any_NEON(const uint8* src_yuy2, int stride_yuy2,
                           uint8* dst_u, uint8* dst_v, int width);
 void YUY2ToUV422Row_Any_NEON(const uint8* src_yuy2,
                              uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToYRow_Any_MSA(const uint8* src_yuy2, uint8* dst_y, int width);
+void YUY2ToUVRow_Any_MSA(const uint8* src_yuy2, int stride_yuy2,
+                         uint8* dst_u, uint8* dst_v, int width);
+void YUY2ToUV422Row_Any_MSA(const uint8* src_yuy2,
+                            uint8* dst_u, uint8* dst_v, int width);
 void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_AVX2(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int width);
@@ -1709,6 +1725,11 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
                       uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_NEON(const uint8* src_uyvy,
                          uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_MSA(const uint8* src_uyvy, int stride_uyvy,
+                     uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_MSA(const uint8* src_uyvy,
+                        uint8* dst_u, uint8* dst_v, int width);
 
 void UYVYToYRow_C(const uint8* src_uyvy, uint8* dst_y, int width);
 void UYVYToUVRow_C(const uint8* src_uyvy, int stride_uyvy,
@@ -1730,6 +1751,11 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
                           uint8* dst_u, uint8* dst_v, int width);
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                              uint8* dst_u, uint8* dst_v, int width);
+void UYVYToYRow_Any_MSA(const uint8* src_uyvy, uint8* dst_y, int width);
+void UYVYToUVRow_Any_MSA(const uint8* src_uyvy, int stride_uyvy,
+                         uint8* dst_u, uint8* dst_v, int width);
+void UYVYToUV422Row_Any_MSA(const uint8* src_uyvy,
+                            uint8* dst_u, uint8* dst_v, int width);
 
 void I422ToYUY2Row_C(const uint8* src_y,
                      const uint8* src_u,
diff --git a/source/convert.cc b/source/convert.cc
index a33742d24..ed3cd7fd0 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -392,6 +392,16 @@ int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUVRow = YUY2ToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUVRow = YUY2ToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
@@ -457,6 +467,16 @@ int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUVRow = UYVYToUVRow_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUVRow = UYVYToUVRow_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height - 1; y += 2) {
     UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 6fea63079..f9fa212f3 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -482,6 +482,16 @@ int YUY2ToI422(const uint8* src_yuy2, int src_stride_yuy2,
     }
   }
 #endif
+#if defined(HAS_YUY2TOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    YUY2ToYRow = YUY2ToYRow_Any_MSA;
+    YUY2ToUV422Row = YUY2ToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      YUY2ToYRow = YUY2ToYRow_MSA;
+      YUY2ToUV422Row = YUY2ToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
@@ -556,6 +566,16 @@ int UYVYToI422(const uint8* src_uyvy, int src_stride_uyvy,
     }
   }
 #endif
+#if defined(HAS_UYVYTOYROW_MSA)
+  if (TestCpuFlag(kCpuHasMSA)) {
+    UYVYToYRow = UYVYToYRow_Any_MSA;
+    UYVYToUV422Row = UYVYToUV422Row_Any_MSA;
+    if (IS_ALIGNED(width, 32)) {
+      UYVYToYRow = UYVYToYRow_MSA;
+      UYVYToUV422Row = UYVYToUV422Row_MSA;
+    }
+  }
+#endif
 
   for (y = 0; y < height; ++y) {
     UYVYToUV422Row(src_uyvy, dst_u, dst_v, width);
diff --git a/source/row_any.cc b/source/row_any.cc
index 53f232ed7..af9ecc511 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -442,6 +442,12 @@ ANY11(YUY2ToYRow_Any_NEON, YUY2ToYRow_NEON, 1, 4, 1, 15)
 #ifdef HAS_UYVYTOYROW_NEON
 ANY11(UYVYToYRow_Any_NEON, UYVYToYRow_NEON, 0, 2, 1, 15)
 #endif
+#ifdef HAS_YUY2TOYROW_MSA
+ANY11(YUY2ToYRow_Any_MSA, YUY2ToYRow_MSA, 1, 4, 1, 31)
+#endif
+#ifdef HAS_UYVYTOYROW_MSA
+ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 0, 2, 1, 31)
+#endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
@@ -763,6 +769,10 @@ ANY12(ARGBToUV411Row_Any_NEON, ARGBToUV411Row_NEON, 0, 4, 2, 31)
 ANY12(YUY2ToUV422Row_Any_NEON, YUY2ToUV422Row_NEON, 1, 4, 1, 15)
 ANY12(UYVYToUV422Row_Any_NEON, UYVYToUV422Row_NEON, 1, 4, 1, 15)
 #endif
+#ifdef HAS_YUY2TOUV422ROW_MSA
+ANY12(YUY2ToUV422Row_Any_MSA, YUY2ToUV422Row_MSA, 1, 4, 1, 31)
+ANY12(UYVYToUV422Row_Any_MSA, UYVYToUV422Row_MSA, 1, 4, 1, 31)
+#endif
 #undef ANY12
 
 // Any 1 to 2 with source stride (2 rows of source).  Outputs UV planes.
@@ -848,6 +858,12 @@ ANY12S(YUY2ToUVRow_Any_NEON, YUY2ToUVRow_NEON, 1, 4, 15)
 #ifdef HAS_UYVYTOUVROW_NEON
 ANY12S(UYVYToUVRow_Any_NEON, UYVYToUVRow_NEON, 1, 4, 15)
 #endif
+#ifdef HAS_YUY2TOUVROW_MSA
+ANY12S(YUY2ToUVRow_Any_MSA, YUY2ToUVRow_MSA, 1, 4, 31)
+#endif
+#ifdef HAS_UYVYTOUVROW_MSA
+ANY12S(UYVYToUVRow_Any_MSA, UYVYToUVRow_MSA, 1, 4, 31)
+#endif
 #undef ANY12S
 
 #ifdef __cplusplus
diff --git a/source/row_msa.cc b/source/row_msa.cc
index 52a246cdb..acc60520a 100644
--- a/source/row_msa.cc
+++ b/source/row_msa.cc
@@ -101,6 +101,126 @@ void I422ToUYVYRow_MSA(const uint8* src_y,
   }
 }
 
+void YUY2ToYRow_MSA(const uint8* src_yuy2, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
+    dst1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_yuy2 += 64;
+    dst_y += 32;
+  }
+}
+
+void YUY2ToUVRow_MSA(const uint8* src_yuy2, int src_stride_yuy2,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8* src_yuy2_next = src_yuy2 + src_stride_yuy2;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    LD_UB4(src_yuy2_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
+    src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
+    src2 = (v16u8) __msa_pckod_b((v16i8) src5, (v16i8) src4);
+    src3 = (v16u8) __msa_pckod_b((v16i8) src7, (v16i8) src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
+    dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    src_yuy2_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void YUY2ToUV422Row_MSA(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_yuy2, 16, src0, src1, src2, src3);
+    src0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
+    src1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
+    dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_yuy2 += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void UYVYToYRow_MSA(const uint8* src_uyvy, uint8* dst_y, int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    dst0 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
+    dst1 = (v16u8) __msa_pckod_b((v16i8) src3, (v16i8) src2);
+    ST_UB2(dst0, dst1, dst_y, 16);
+    src_uyvy += 64;
+    dst_y += 32;
+  }
+}
+
+void UYVYToUVRow_MSA(const uint8* src_uyvy, int src_stride_uyvy,
+                     uint8* dst_u, uint8* dst_v, int width) {
+  const uint8 *src_uyvy_next = src_uyvy + src_stride_uyvy;
+  int x;
+  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16u8 vec0, vec1, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    LD_UB4(src_uyvy_next, 16, src4, src5, src6, src7);
+    src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
+    src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
+    src2 = (v16u8) __msa_pckev_b((v16i8) src5, (v16i8) src4);
+    src3 = (v16u8) __msa_pckev_b((v16i8) src7, (v16i8) src6);
+    vec0 = __msa_aver_u_b(src0, src2);
+    vec1 = __msa_aver_u_b(src1, src3);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
+    dst1 = (v16u8) __msa_pckod_b((v16i8) vec1, (v16i8) vec0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    src_uyvy_next += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
+void UYVYToUV422Row_MSA(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+                        int width) {
+  int x;
+  v16u8 src0, src1, src2, src3, dst0, dst1;
+
+  for (x = 0; x < width; x += 32) {
+    LD_UB4(src_uyvy, 16, src0, src1, src2, src3);
+    src0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
+    src1 = (v16u8) __msa_pckev_b((v16i8) src3, (v16i8) src2);
+    dst0 = (v16u8) __msa_pckev_b((v16i8) src1, (v16i8) src0);
+    dst1 = (v16u8) __msa_pckod_b((v16i8) src1, (v16i8) src0);
+    ST_UB(dst0, dst_u);
+    ST_UB(dst1, dst_v);
+    src_uyvy += 64;
+    dst_u += 16;
+    dst_v += 16;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv