From 845e94d1a74dc4b773159bf91d3c5da23b781476 Mon Sep 17 00:00:00 2001 From: "fbarchard@google.com" Date: Fri, 10 Aug 2012 03:48:21 +0000 Subject: [PATCH] Affine do 4 pixels at a time. BUG=none TEST=affine unitest Review URL: https://webrtc-codereview.appspot.com/729005 git-svn-id: http://libyuv.googlecode.com/svn/trunk@319 16f28f9a-4ce2-e073-06de-1de4eb20be90 --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/row_win.cc | 79 +++++++++++++++++++++++----------------- 3 files changed, 48 insertions(+), 35 deletions(-) diff --git a/README.chromium b/README.chromium index 210e8aaa5..3c4dafc87 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 318 +Version: 319 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index 5f151adbe..c3243143b 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 318 +#define LIBYUV_VERSION 319 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/row_win.cc b/source/row_win.cc index 5de63711e..df601a621 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3359,53 +3359,66 @@ void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, mov edx, [esp + 16] // dst_argb mov ecx, [esp + 20] // pointer to uv_dudv movq xmm2, qword ptr [ecx] // uv - movq xmm3, qword ptr [ecx + 8] // dudv + movq xmm7, qword ptr [ecx + 8] // dudv mov ecx, [esp + 24] // width shl esi, 16 // 4, stride add esi, 4 - movd xmm4, esi - sub ecx, 2 - jl l2b + movd xmm5, esi + sub ecx, 4 + jl l4b + // setup for 4 pixel loop + pshufd xmm7, xmm7, 0x44 // dup dudv + pshufd xmm5, xmm5, 0 // dup 4, stride movdqa xmm0, xmm2 // x0, y0, x1, y1 - addps xmm0, xmm3 + addps xmm0, xmm7 movlhps xmm2, xmm0 - pshufd xmm4, xmm4, 0 // dup 4, stride - movlhps xmm3, xmm3 // dudv - addps xmm3, xmm3 // dudv *= 2 - pshufd xmm4, xmm4, 0 + movdqa xmm4, xmm7 + addps xmm4, xmm4 // dudv *= 2 + movdqa xmm3, xmm2 // x2, y2, x3, y3 + addps xmm3, xmm4 + addps xmm4, xmm4 // dudv *= 4 - // 2 pixel loop + // 4 pixel loop align 4 - l2: - cvttps2dq xmm1, xmm2 // x, y float to int - packssdw xmm1, xmm1 // x, y as shorts - pmaddwd xmm1, xmm4 // offset = x * 4 + y * stride - addps xmm2, xmm3 // x, y += dx, dy - movd esi, xmm1 - movdqa xmm5, xmm1 - pshufd xmm5, xmm5, 0x55 - movd xmm0, [eax + esi] // read pixel 0 - movd esi, xmm5 - movd xmm5, [eax + esi] // read pixel 1 - punpckldq xmm0, xmm5 - sub ecx, 2 - movq qword ptr [edx], xmm0 - lea edx, [edx + 8] - jge l2 + l4: + cvttps2dq xmm0, xmm2 // x, y float to int first 2 + cvttps2dq xmm1, xmm3 // x, y float to int next 2 + packssdw xmm0, xmm1 // x, y as 8 shorts + pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. + addps xmm2, xmm4 // x, y += dx, dy first 2 + addps xmm3, xmm4 // x, y += dx, dy next 2 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm1, [eax + esi] // read pixel 0 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm6, [eax + esi] // read pixel 1 + punpckldq xmm1, xmm6 // combine pixel 0 and 1 + movd esi, xmm0 + pshufd xmm0, xmm0, 0x39 // shift right + movd xmm6, [eax + esi] // read pixel 2 + movd esi, xmm0 + movd xmm0, [eax + esi] // read pixel 3 + punpckldq xmm6, xmm0 // combine pixel 2 and 3 + punpcklqdq xmm1, xmm6 // combine pixel 0, 1, 2 and 3 + sub ecx, 4 + movdqu [edx], xmm1 + lea edx, [edx + 16] + jge l4 - l2b: - add ecx, 2 - 1 + l4b: + add ecx, 4 - 1 jl l1b // 1 pixel loop align 4 l1: - cvttps2dq xmm1, xmm2 // x, y float to int - packssdw xmm1, xmm1 // x, y as shorts - pmaddwd xmm1, xmm4 // offset = x * 4 + y * stride - addps xmm2, xmm3 // x, y += dx, dy - movd esi, xmm1 + cvttps2dq xmm0, xmm2 // x, y float to int + packssdw xmm0, xmm0 // x, y as shorts + pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride + addps xmm2, xmm7 // x, y += dx, dy + movd esi, xmm0 movd xmm0, [eax + esi] // copy a pixel sub ecx, 1 movd [edx], xmm0