Remove alignment from ARGBToRGB24 and ARGBToRAW to allow fast code to be used all of the time. Improves performance on Westmere and beyond, but hurts performance for aligned buffers on older CPUs.

BUG=230
TESTED=try bot
R=nfullagar@google.com

Review URL: https://webrtc-codereview.appspot.com/2197007

git-svn-id: http://libyuv.googlecode.com/svn/trunk@785 16f28f9a-4ce2-e073-06de-1de4eb20be90
This commit is contained in:
fbarchard@google.com 2013-09-11 01:18:36 +00:00
parent 1390aaf69a
commit 7e7c7753ba
5 changed files with 32 additions and 36 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 784
Version: 785
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 784
#define LIBYUV_VERSION 785
#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT

View File

@ -744,9 +744,7 @@ int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRGB24Row_C;
#if defined(HAS_ARGBTORGB24ROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_rgb24, 16) && IS_ALIGNED(dst_stride_rgb24, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
@ -792,9 +790,7 @@ int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
ARGBToRAWRow_C;
#if defined(HAS_ARGBTORAWROW_SSSE3)
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
IS_ALIGNED(dst_raw, 16) && IS_ALIGNED(dst_stride_raw, 16)) {
if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
if (IS_ALIGNED(width, 16)) {
ARGBToRAWRow = ARGBToRAWRow_SSSE3;

View File

@ -569,10 +569,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
@ -584,13 +584,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
"movdqa %%xmm0,(%1) \n"
"movdqu %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
"movdqa %%xmm1,0x10(%1) \n"
"movdqa %%xmm2,0x20(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"movdqu %%xmm2,0x20(%1) \n"
"lea 0x30(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"
@ -610,10 +610,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %3,%%xmm6 \n"
".p2align 4 \n"
"1: \n"
"movdqa (%0),%%xmm0 \n"
"movdqa 0x10(%0),%%xmm1 \n"
"movdqa 0x20(%0),%%xmm2 \n"
"movdqa 0x30(%0),%%xmm3 \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
"movdqu 0x20(%0),%%xmm2 \n"
"movdqu 0x30(%0),%%xmm3 \n"
"lea 0x40(%0),%0 \n"
"pshufb %%xmm6,%%xmm0 \n"
"pshufb %%xmm6,%%xmm1 \n"
@ -625,13 +625,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
"movdqa %%xmm2,%%xmm5 \n"
"por %%xmm4,%%xmm0 \n"
"pslldq $0x8,%%xmm5 \n"
"movdqa %%xmm0,(%1) \n"
"movdqu %%xmm0,(%1) \n"
"por %%xmm5,%%xmm1 \n"
"psrldq $0x8,%%xmm2 \n"
"pslldq $0x4,%%xmm3 \n"
"por %%xmm3,%%xmm2 \n"
"movdqa %%xmm1,0x10(%1) \n"
"movdqa %%xmm2,0x20(%1) \n"
"movdqu %%xmm1,0x10(%1) \n"
"movdqu %%xmm2,0x20(%1) \n"
"lea 0x30(%1),%1 \n"
"sub $0x10,%2 \n"
"jg 1b \n"

View File

@ -479,10 +479,10 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 16
convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@ -494,13 +494,13 @@ void ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
movdqa [edx], xmm0 // store 0
movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
movdqa [edx + 16], xmm1 // store 1
movdqa [edx + 32], xmm2 // store 2
movdqu [edx + 16], xmm1 // store 1
movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop
@ -518,10 +518,10 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
align 16
convertloop:
movdqa xmm0, [eax] // fetch 16 pixels of argb
movdqa xmm1, [eax + 16]
movdqa xmm2, [eax + 32]
movdqa xmm3, [eax + 48]
movdqu xmm0, [eax] // fetch 16 pixels of argb
movdqu xmm1, [eax + 16]
movdqu xmm2, [eax + 32]
movdqu xmm3, [eax + 48]
lea eax, [eax + 64]
pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB
pshufb xmm1, xmm6
@ -533,13 +533,13 @@ void ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
movdqa xmm5, xmm2 // 8 bytes from 2 for 1
por xmm0, xmm4 // 4 bytes from 1 for 0
pslldq xmm5, 8 // 8 bytes from 2 for 1
movdqa [edx], xmm0 // store 0
movdqu [edx], xmm0 // store 0
por xmm1, xmm5 // 8 bytes from 2 for 1
psrldq xmm2, 8 // 4 bytes from 2
pslldq xmm3, 4 // 12 bytes from 3 for 2
por xmm2, xmm3 // 12 bytes from 3 for 2
movdqa [edx + 16], xmm1 // store 1
movdqa [edx + 32], xmm2 // store 2
movdqu [edx + 16], xmm1 // store 1
movdqu [edx + 32], xmm2 // store 2
lea edx, [edx + 48]
sub ecx, 16
jg convertloop