From 205c1440cf822b7203934eb818a6ea278fd93cba Mon Sep 17 00:00:00 2001
From: "fbarchard@google.com"
Date: Tue, 7 Oct 2014 19:47:06 +0000
Subject: [PATCH] Use movdqu then pavgb to allow unaligned memory for rgb
 subsampling code.

Allows this assembly to be used for unaligned pointers as well as aligned
ones with no performance hit when memory is aligned on a modern cpu.

BUG=365
TESTED=libyuvTest.ARGBToI420_Unaligned (453 ms)
R=harryjin@google.com

Review URL: https://webrtc-codereview.appspot.com/30679004

git-svn-id: http://libyuv.googlecode.com/svn/trunk@1116 16f28f9a-4ce2-e073-06de-1de4eb20be90
---
 README.chromium          |  2 +-
 include/libyuv/row.h     | 10 +++---
 include/libyuv/version.h |  2 +-
 source/row_posix.cc      | 70 ++++++++++++++++++++++++++--------------
 source/row_win.cc        | 65 +++++++++++++++++++++++++------------
 5 files changed, 97 insertions(+), 52 deletions(-)

diff --git a/README.chromium b/README.chromium
index 6cd2d60ee..eccd8c6b4 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1115
+Version: 1116
 License: BSD
 License File: LICENSE
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 186dcbd2d..07d38756a 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -100,8 +100,8 @@ extern "C" {
 #define HAS_SOBELYROW_SSE2
 
 // Conversions:
-//#define HAS_ARGBTOUVROW_SSSE3
-//#define HAS_ABGRTOUVROW_SSSE3
+#define HAS_ARGBTOUVROW_SSSE3
+#define HAS_ABGRTOUVROW_SSSE3
 #define HAS_ABGRTOYROW_SSSE3
 #define HAS_ARGB1555TOARGBROW_SSE2
 #define HAS_ARGB4444TOARGBROW_SSE2
@@ -116,10 +116,10 @@ extern "C" {
 #define HAS_ARGBTORGB565ROW_SSE2
 #define HAS_ARGBTOUV422ROW_SSSE3
 #define HAS_ARGBTOUV444ROW_SSSE3
-//#define HAS_ARGBTOUVJROW_SSSE3
+#define HAS_ARGBTOUVJROW_SSSE3
 #define HAS_ARGBTOYJROW_SSSE3
 #define HAS_ARGBTOYROW_SSSE3
-//#define HAS_BGRATOUVROW_SSSE3
+#define HAS_BGRATOUVROW_SSSE3
 #define HAS_BGRATOYROW_SSSE3
 #define HAS_COPYROW_ERMS
 #define HAS_COPYROW_SSE2
@@ -153,7 +153,7 @@ extern "C" {
 #define HAS_RGB24TOARGBROW_SSSE3
 #define HAS_RGB24TOYROW_SSSE3
 #define HAS_RGB565TOARGBROW_SSE2
-//#define HAS_RGBATOUVROW_SSSE3
+#define HAS_RGBATOUVROW_SSSE3
 #define HAS_RGBATOYROW_SSSE3
 #define HAS_SETROW_X86
 #define HAS_SPLITUVROW_SSE2
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 772dc4332..c1f4bdab0 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1115
+#define LIBYUV_VERSION 1116
 
 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/source/row_posix.cc b/source/row_posix.cc
index 90dc5f22c..55aa9a18a 100644
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@@ -807,14 +807,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -876,14 +880,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1111,14 +1119,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1251,14 +1263,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
@@ -1317,14 +1333,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
     LABELALIGN
   "1:                                          \n"
     "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
+    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           // movdqu (%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm0                   \n"
     "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
+    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           // movdqu 0x10(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm1                   \n"
     "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
+    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           // movdqu 0x20(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm2                   \n"
     "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
-    BUNDLEALIGN
-    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            // pavgb (%0,%4,1),%%xmm0
-    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            // pavgb 0x10(%0,%4,1),%%xmm1
-    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            // pavgb 0x20(%0,%4,1),%%xmm2
-    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            // pavgb 0x30(%0,%4,1),%%xmm6
+    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           // movdqu 0x30(%0,%4,1),%%xmm7
+    "pavgb     %%xmm7,%%xmm6                   \n"
+
     "lea       " MEMLEA(0x40,0) ",%0           \n"
     "movdqa    %%xmm0,%%xmm7                   \n"
     "shufps    $0x88,%%xmm1,%%xmm0             \n"
diff --git a/source/row_win.cc b/source/row_win.cc
index a5b99d7b0..e26a62275 100644
--- a/source/row_win.cc
+++ b/source/row_win.cc
@@ -977,13 +977,18 @@ void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1043,13 +1048,18 @@ void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1294,13 +1304,18 @@ void BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1360,13 +1375,18 @@ void ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
@@ -1426,13 +1446,18 @@ void RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
  convertloop:
     /* step 1 - subsample 16x2 argb pixels to 8x1 */
     movdqu     xmm0, [eax]
+    movdqu     xmm4, [eax + esi]
+    pavgb      xmm0, xmm4
     movdqu     xmm1, [eax + 16]
+    movdqu     xmm4, [eax + esi + 16]
+    pavgb      xmm1, xmm4
     movdqu     xmm2, [eax + 32]
+    movdqu     xmm4, [eax + esi + 32]
+    pavgb      xmm2, xmm4
     movdqu     xmm3, [eax + 48]
-    pavgb      xmm0, [eax + esi]
-    pavgb      xmm1, [eax + esi + 16]
-    pavgb      xmm2, [eax + esi + 32]
-    pavgb      xmm3, [eax + esi + 48]
+    movdqu     xmm4, [eax + esi + 48]
+    pavgb      xmm3, xmm4
+
     lea        eax,  [eax + 64]
     movdqa     xmm4, xmm0
     shufps     xmm0, xmm1, 0x88
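
Note on the pattern (editor's illustration, not part of the patch): the old code
averaged against a 128-bit memory operand with pavgb, which faults unless that
address is 16-byte aligned; loading the second row with movdqu first and then
averaging register-to-register removes the alignment requirement. Below is a
minimal sketch of the same load-then-average step using SSE2 intrinsics. The
function name, its parameters, and the standalone store are illustrative
assumptions and do not appear in libyuv.

// Sketch only: average two rows of 'width' bytes (width assumed to be a
// multiple of 16) with no alignment requirement on any pointer.
// _mm_loadu_si128 corresponds to movdqu, _mm_avg_epu8 to pavgb.
#include <emmintrin.h>  // SSE2 intrinsics
#include <stdint.h>

static void AverageRowsUnaligned(const uint8_t* row0, const uint8_t* row1,
                                 uint8_t* dst, int width) {
  for (int x = 0; x < width; x += 16) {
    __m128i a = _mm_loadu_si128((const __m128i*)(row0 + x));   // movdqu load
    __m128i b = _mm_loadu_si128((const __m128i*)(row1 + x));   // movdqu load
    __m128i avg = _mm_avg_epu8(a, b);                          // pavgb: rounded (a+b+1)>>1
    _mm_storeu_si128((__m128i*)(dst + x), avg);                // unaligned store
  }
}

In the patch itself the averaged rows stay in xmm registers and feed the shufps
subsampling step visible in the context lines rather than being stored, but the
alignment argument is the same.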