diff --git a/README.chromium b/README.chromium
index 441cf7581..0a637246f 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 707
+Version: 708
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 5cb5d7f2f..eb2624953 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 707
+#define LIBYUV_VERSION 708
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
diff --git a/source/scale.cc b/source/scale.cc
index f39f1dace..721beee08 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -196,16 +196,14 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                // src_stride ignored
     mov        edx, [esp + 12]   // dst_ptr
     mov        ecx, [esp + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
 
     align      16
   wloop:
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
+    psrlw      xmm0, 8           // isolate odd pixels.
+    psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     sub        ecx, 16
     movdqa     [edx], xmm0
@@ -271,16 +269,14 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                // src_stride ignored
     mov        edx, [esp + 12]   // dst_ptr
     mov        ecx, [esp + 16]   // dst_width
-    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
-    psrlw      xmm5, 8
 
     align      16
   wloop:
     movdqu     xmm0, [eax]
     movdqu     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    pand       xmm0, xmm5
-    pand       xmm1, xmm5
+    psrlw      xmm0, 8           // isolate odd pixels.
+    psrlw      xmm1, 8
     packuswb   xmm0, xmm1
     sub        ecx, 16
     movdqu     [edx], xmm0
@@ -1269,15 +1265,13 @@ static void ScaleFilterRows_Unaligned_SSSE3(uint8* dst_ptr,
 static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
     "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
@@ -1289,7 +1283,7 @@ static void ScaleRowDown2_SSE2(const uint8* src_ptr, ptrdiff_t src_stride,
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
+    , "xmm0", "xmm1"
 #endif
   );
 }
@@ -1336,15 +1330,13 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
                                          ptrdiff_t src_stride,
                                          uint8* dst_ptr, int dst_width) {
   asm volatile (
-    "pcmpeqb   %%xmm5,%%xmm5                   \n"
-    "psrlw     $0x8,%%xmm5                     \n"
     ".p2align  4                               \n"
   "1:                                          \n"
     "movdqu    (%0),%%xmm0                     \n"
     "movdqu    0x10(%0),%%xmm1                 \n"
     "lea       0x20(%0),%0                     \n"
-    "pand      %%xmm5,%%xmm0                   \n"
-    "pand      %%xmm5,%%xmm1                   \n"
+    "psrlw     $0x8,%%xmm0                     \n"
+    "psrlw     $0x8,%%xmm1                     \n"
     "packuswb  %%xmm1,%%xmm0                   \n"
     "movdqu    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
@@ -1356,7 +1348,7 @@ static void ScaleRowDown2_Unaligned_SSE2(const uint8* src_ptr,
   :
   : "memory", "cc"
 #if defined(__SSE2__)
-    , "xmm0", "xmm1", "xmm5"
+    , "xmm0", "xmm1"
 #endif
   );
 }
@@ -2324,13 +2316,13 @@ static void ScaleRowDown2_C(const uint8* src_ptr, ptrdiff_t /* src_stride */,
                             uint8* dst, int dst_width) {
   uint8* dend = dst + dst_width - 1;
   do {
-    dst[0] = src_ptr[0];
-    dst[1] = src_ptr[2];
+    dst[0] = src_ptr[1];
+    dst[1] = src_ptr[3];
     dst += 2;
     src_ptr += 4;
   } while (dst < dend);
   if (dst_width & 1) {
-    dst[0] = src_ptr[0];
+    dst[0] = src_ptr[1];
   }
 }
@@ -2689,6 +2681,7 @@ static void ScalePlaneDown2(int /* src_width */, int /* src_height */,
   }
 #endif
 
+  src_ptr += src_stride;  // Point to odd rows.
   // TODO(fbarchard): Loop through source height to allow odd height.
   for (int y = 0; y < dst_height; ++y) {
     ScaleRowDown2(src_ptr, src_stride, dst_ptr, dst_width);
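
Context for the scale.cc hunks above (not part of the patch itself): the old code built a 0x00ff00ff mask in xmm5 and masked off the high bytes to keep the even pixels; the new code instead shifts each 16-bit lane right by 8, which discards the even byte and leaves the odd pixel in the low byte, freeing xmm5. A minimal SSE2-intrinsics sketch of the same sequence, assuming dst_width is a multiple of 16 (the helper name is hypothetical, not from libyuv):

    #include <emmintrin.h>  // SSE2

    // Point-sample a row down by 2, keeping the odd pixel of each pair,
    // via the same psrlw + packuswb sequence as the patched assembly.
    static void HalveRowOdd_SSE2(const unsigned char* src_ptr,
                                 unsigned char* dst_ptr, int dst_width) {
      for (int x = 0; x < dst_width; x += 16) {
        __m128i a = _mm_loadu_si128((const __m128i*)(src_ptr + 2 * x));
        __m128i b = _mm_loadu_si128((const __m128i*)(src_ptr + 2 * x + 16));
        a = _mm_srli_epi16(a, 8);  // each 16-bit lane: keep the odd byte
        b = _mm_srli_epi16(b, 8);
        _mm_storeu_si128((__m128i*)(dst_ptr + x),
                         _mm_packus_epi16(a, b));  // pack 2x8 words to 16 bytes
      }
    }

The C fallback agrees with this: src_ptr[1] and src_ptr[3] are the odd pixels of each pair, and ScalePlaneDown2 now starts on the odd row, so the point-sampled path samples from the same 2x2 neighborhood that the averaging (Int) path reduces.
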
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index 248236c24..fa271556a 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -62,7 +62,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     movdqa     xmm0, [eax]
     movdqa     xmm1, [eax + 16]
     lea        eax,  [eax + 32]
-    shufps     xmm0, xmm1, 0x88
+    shufps     xmm0, xmm1, 0xdd
     sub        ecx, 4
     movdqa     [edx], xmm0
     lea        edx, [edx + 16]
@@ -350,7 +350,7 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_argb,
     "movdqa    (%0),%%xmm0                     \n"
     "movdqa    0x10(%0),%%xmm1                 \n"
     "lea       0x20(%0),%0                     \n"
-    "shufps    $0x88,%%xmm1,%%xmm0             \n"
+    "shufps    $0xdd,%%xmm1,%%xmm0             \n"
     "sub       $0x4,%2                         \n"
     "movdqa    %%xmm0,(%1)                     \n"
     "lea       0x10(%1),%1                     \n"
@@ -634,13 +634,13 @@ static void ScaleARGBRowDown2_C(const uint8* src_argb,
   uint32* dst = reinterpret_cast<uint32*>(dst_argb);
 
   for (int x = 0; x < dst_width - 1; x += 2) {
-    dst[0] = src[0];
-    dst[1] = src[2];
+    dst[0] = src[1];
+    dst[1] = src[3];
     src += 4;
     dst += 2;
   }
   if (dst_width & 1) {
-    dst[0] = src[0];
+    dst[0] = src[1];
   }
 }
@@ -743,25 +743,26 @@ static void ScaleARGBDown2(int /* src_width */, int /* src_height */,
                            FilterMode filtering) {
   assert(dx == 65536 * 2);  // Test scale factor of 2.
   assert((dy & 0x1ffff) == 0);  // Test vertical scale is multiple of 2.
+  // Advance to odd row / even column.
+  src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+  int row_stride = src_stride * (dy >> 16);
   void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
                             uint8* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
-      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+      IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
       IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
         ScaleARGBRowDown2_SSE2;
   }
 #elif defined(HAS_SCALEARGBROWDOWN2_NEON)
   if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
-      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+      IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_NEON :
         ScaleARGBRowDown2_NEON;
   }
 #endif
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
-  int row_stride = src_stride * (dy >> 16);
 
   // TODO(fbarchard): Loop through source height to allow odd height.
   for (int y = 0; y < dst_height; ++y) {
@@ -782,6 +783,9 @@ static void ScaleARGBDownEven(int src_width, int src_height,
                               FilterMode filtering) {
   assert(IS_ALIGNED(src_width, 2));
   assert(IS_ALIGNED(src_height, 2));
+  int col_step = dx >> 16;
+  int row_stride = (dy >> 16) * src_stride;
+  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
   void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
                                int src_step, uint8* dst_argb, int dst_width) =
       filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
@@ -798,9 +802,6 @@ static void ScaleARGBDownEven(int src_width, int src_height,
         ScaleARGBRowDownEven_NEON;
   }
 #endif
-  int col_step = dx >> 16;
-  int row_stride = (dy >> 16) * src_stride;
-  src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
 
   for (int y = 0; y < dst_height; ++y) {
     ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
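
Context for the scale_argb.cc hunks above (not part of the patch itself): with 4-byte ARGB pixels, shufps selects 32-bit elements; the old immediate 0x88 (0b10'00'10'00) picked elements 0 and 2 of each register (even pixels), while 0xdd (0b11'01'11'01) picks elements 1 and 3 (odd pixels). A scalar sketch of what one row function now computes (the helper name is hypothetical, not from libyuv):

    #include <stdint.h>

    // Point-sample an ARGB row down by 2, keeping the odd pixel of each
    // pair, as the patched shufps $0xdd / ScaleARGBRowDown2_C now do.
    static void HalveArgbRowOdd(const uint32_t* src, uint32_t* dst,
                                int dst_width) {
      for (int x = 0; x < dst_width; ++x) {
        dst[x] = src[2 * x + 1];  // odd pixel of each 2-pixel pair
      }
    }

The pointer setup in ScaleARGBDown2 and ScaleARGBDownEven is hoisted above the SIMD dispatch so the alignment tests check the adjusted src_argb and the per-row stride (row_stride) that the loop actually uses, rather than the unadjusted values.
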
diff --git a/source/scale_argb_neon.cc b/source/scale_argb_neon.cc
index 720b72e22..819186bc7 100644
--- a/source/scale_argb_neon.cc
+++ b/source/scale_argb_neon.cc
@@ -27,8 +27,8 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
     "vld2.u32   {q0, q1}, [%0]!                \n"
     "vld2.u32   {q2, q3}, [%0]!                \n"
     "subs       %2, %2, #8                     \n"  // 8 processed per loop
-    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
-    "vst1.u8    {q2}, [%1]!                    \n"
+    "vst1.u8    {q1}, [%1]!                    \n"  // store odd pixels
+    "vst1.u8    {q3}, [%1]!                    \n"
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
     "+r"(dst),              // %1
@@ -78,6 +78,7 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t,
                                int src_stepx,
                                uint8* dst_argb, int dst_width) {
   asm volatile (
+    "add        %0, #4                         \n"  // point to odd pixels.
     "mov        r12, %3, lsl #2                \n"
     ".p2align   2                              \n"
   "1:                                          \n"
diff --git a/source/scale_mips.cc b/source/scale_mips.cc
index ba8e8b516..b30eaba0c 100644
--- a/source/scale_mips.cc
+++ b/source/scale_mips.cc
@@ -39,6 +39,7 @@ void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t /* src_stride */,
     "lw         $t5, 20(%[src_ptr])            \n"  // |23|22|21|20|
     "lw         $t6, 24(%[src_ptr])            \n"  // |27|26|25|24|
     "lw         $t7, 28(%[src_ptr])            \n"  // |31|30|29|28|
+    // TODO(fbarchard): Use odd pixels instead of even.
     "precr.qb.ph $t8, $t1, $t0                 \n"  // |6|4|2|0|
     "precr.qb.ph $t0, $t3, $t2                 \n"  // |14|12|10|8|
     "precr.qb.ph $t1, $t5, $t4                 \n"  // |22|20|18|16|
diff --git a/source/scale_neon.cc b/source/scale_neon.cc
index c1cf7f11b..2449ec80e 100644
--- a/source/scale_neon.cc
+++ b/source/scale_neon.cc
@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t /* src_stride */,
     // load even pixels into q0, odd into q1
     "vld2.u8    {q0,q1}, [%0]!                 \n"
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
-    "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
+    "vst1.u8    {q1}, [%1]!                    \n"  // store odd pixels
     "bgt        1b                             \n"
   : "+r"(src_ptr),          // %0
     "+r"(dst),              // %1
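
Context for the NEON and MIPS hunks above (not part of the patch itself): vld2 deinterleaves as it loads, placing even elements in the first register (q0/q2) and odd elements in the second (q1/q3), so switching the store to q1/q3 selects the odd pixels at no extra cost; ScaleARGBRowDownEven_NEON instead biases the base pointer by one ARGB pixel (add %0, #4). The MIPS DSPR2 path still packs even pixels, hence the TODO. A minimal NEON-intrinsics sketch of the deinterleaving trick, assuming dst_width is a multiple of 16 (the helper name is hypothetical, not from libyuv):

    #include <arm_neon.h>

    // Point-sample a row down by 2 by loading deinterleaved byte pairs
    // and storing only the odd-indexed bytes, as the patched NEON asm does.
    static void HalveRowOdd_NEON(const uint8_t* src_ptr, uint8_t* dst_ptr,
                                 int dst_width) {
      for (int x = 0; x < dst_width; x += 16) {
        uint8x16x2_t v = vld2q_u8(src_ptr + 2 * x);  // val[0]=even, val[1]=odd
        vst1q_u8(dst_ptr + x, v.val[1]);             // keep odd pixels
      }
    }
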