diff --git a/include/libyuv/scale_argb.h b/include/libyuv/scale_argb.h
index 575119c9c..225f77258 100644
--- a/include/libyuv/scale_argb.h
+++ b/include/libyuv/scale_argb.h
@@ -22,7 +22,7 @@ extern "C" {
 enum FilterMode {
   kFilterNone = 0,      // Point sample; Fastest
   kFilterBilinear = 1,  // Faster than box, but lower quality scaling down.
-  kFilterBox = 2        // Highest quality
+  kFilterBox = 2        // Highest quality (not supported for ARGB)
 };
 
 int ARGBScale(const uint8* src_argb, int src_stride_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index fcbb506b0..8efe68508 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -775,13 +775,14 @@ __declspec(naked) __declspec(align(16))
 static void SetRows32_X86(uint8* dst, uint32 v32, int width,
                           int dst_stride, int height) {
   __asm {
+    push       esi
     push       edi
     push       ebp
-    mov        edi, [esp + 8 + 4]    // dst
-    mov        eax, [esp + 8 + 8]    // v32
-    mov        ebp, [esp + 8 + 12]   // width
-    mov        edx, [esp + 8 + 16]   // dst_stride
-    mov        ebx, [esp + 8 + 20]   // height
+    mov        edi, [esp + 12 + 4]   // dst
+    mov        eax, [esp + 12 + 8]   // v32
+    mov        ebp, [esp + 12 + 12]  // width
+    mov        edx, [esp + 12 + 16]  // dst_stride
+    mov        esi, [esp + 12 + 20]  // height
     lea        ecx, [ebp * 4]
     sub        edx, ecx              // stride - width * 4
 
@@ -790,11 +791,12 @@ static void SetRows32_X86(uint8* dst, uint32 v32, int width,
     mov        ecx, ebp
     rep stosd
     add        edi, edx
-    sub        ebx, 1
+    sub        esi, 1
     jg         convertloop
 
     pop        ebp
     pop        edi
+    pop        esi
     ret
   }
 }
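The SetRows32_X86 change fixes a register clobber: ebx is callee-saved in the
32-bit x86 calling convention, but the height counter lived in ebx without
being preserved. The counter now lives in esi, which gets a matching
push/pop, and every argument offset grows from [esp + 8 + n] to
[esp + 12 + n] to account for the third saved register.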
diff --git a/source/scale.cc b/source/scale.cc
index c34a5a272..235bcf8f2 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -55,7 +55,7 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, int /* src_stride */,
   asm volatile (
     "1:                                        \n"
     // load even pixels into q0, odd into q1
-    "vld2.u8    {q0,q1}, [%0]!                \n"
+    "vld2.u8    {q0,q1}, [%0]!                 \n"
     "vst1.u8    {q0}, [%1]!                    \n"  // store even pixels
     "subs       %2, %2, #16                    \n"  // 16 processed per loop
     "bgt        1b                             \n"
@@ -71,14 +71,14 @@ void ScaleRowDown2Int_NEON(const uint8* src_ptr, int src_stride,
                            uint8* dst, int dst_width) {
   asm volatile (
     // change the stride to row 2 pointer
-    "add        %1, %0                        \n"
+    "add        %1, %0                         \n"
     "1:                                        \n"
     "vld1.u8    {q0,q1}, [%0]!                 \n"  // load row 1 and post inc
     "vld1.u8    {q2,q3}, [%1]!                 \n"  // load row 2 and post inc
     "vpaddl.u8  q0, q0                         \n"  // row 1 add adjacent
     "vpaddl.u8  q1, q1                         \n"  // row 2 add adjacent, add row 1 to row 2
-    "vpadal.u8  q0, q2                        \n"
+    "vpadal.u8  q0, q2                         \n"
     "vpadal.u8  q1, q3                         \n"
     "vrshrn.u16 d0, q0, #2                     \n"  // downshift, round and pack
     "vrshrn.u16 d1, q1, #2                     \n"
@@ -1399,6 +1399,10 @@ static void ScaleAddRows_SSE2(const uint8* src_ptr, int src_stride,
 }
 
 // Bilinear row filtering combines 16x2 -> 16x1. SSE2 version.
+// The normal formula for bilinear interpolation is:
+//   source_y_fraction * row1 + (1 - source_y_fraction) * row0
+// The SSE2 version uses a single multiply of the difference:
+//   source_y_fraction * (row1 - row0) + row0
 #define HAS_SCALEFILTERROWS_SSE2
 __declspec(naked) __declspec(align(16))
 static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
@@ -1424,8 +1428,6 @@ static void ScaleFilterRows_SSE2(uint8* dst_ptr, const uint8* src_ptr,
     pshufd     xmm5, xmm5, 0
     pxor       xmm4, xmm4
 
-    // f * row1 + (1 - frac) row0
-    // frac * (row1 - row0) + row0
     align 16
   xloop:
     movdqa     xmm0, [esi]  // row0
@@ -3677,11 +3679,13 @@ void ScalePlane(const uint8* src, int src_stride,
     // optimized, 3/8
     ScalePlaneDown38(src_width, src_height, dst_width, dst_height,
                      src_stride, dst_stride, src, dst, filtering);
-  } else if (4 * dst_width == src_width && 4 * dst_height == src_height) {
+  } else if (4 * dst_width == src_width && 4 * dst_height == src_height &&
+             filtering != kFilterBilinear) {
     // optimized, 1/4
     ScalePlaneDown4(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
-  } else if (8 * dst_width == src_width && 8 * dst_height == src_height) {
+  } else if (8 * dst_width == src_width && 8 * dst_height == src_height &&
+             filtering != kFilterBilinear) {
     // optimized, 1/8
     ScalePlaneDown8(src_width, src_height, dst_width, dst_height,
                     src_stride, dst_stride, src, dst, filtering);
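The rewritten comment above is worth unpacking, since the same identity is
used by the ARGB filter-rows code below: with an 8-bit fraction,
source_y_fraction * row1 + (1 - source_y_fraction) * row0 rearranges to
source_y_fraction * (row1 - row0) + row0, which needs one multiply per pixel
instead of two. A scalar sketch of the idea (a hypothetical helper, not part
of this patch; assumes an arithmetic right shift and a fraction scaled to
0..256):

// Scalar model of the fused bilinear row filter.
// frac == 0 returns row0; frac == 256 returns row1.
static void ScaleFilterRowSketch_C(uint8* dst, const uint8* row0,
                                   const uint8* row1, int width, int frac) {
  for (int x = 0; x < width; ++x) {
    // frac * (row1 - row0) + row0, in 8.8 fixed point.
    dst[x] = static_cast<uint8>(row0[x] + ((frac * (row1[x] - row0[x])) >> 8));
  }
}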
diff --git a/source/scale_argb.cc b/source/scale_argb.cc
index a7c1fe383..50ba89bce 100644
--- a/source/scale_argb.cc
+++ b/source/scale_argb.cc
@@ -36,7 +36,7 @@ extern "C" {
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
 static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, int src_stride,
-                            uint8* dst_ptr, int dst_width) {
+                                   uint8* dst_ptr, int dst_width) {
   __asm {
     mov        eax, [esp + 4]        // src_ptr
                                      // src_stride ignored
@@ -61,8 +61,8 @@ static void ScaleARGBRowDown2_SSE2(const uint8* src_ptr, int src_stride,
 // Blends 8x2 rectangle to 4x1.
 // Alignment requirement: src_ptr 16 byte aligned, dst_ptr 16 byte aligned.
 __declspec(naked) __declspec(align(16))
-void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
-                               uint8* dst_ptr, int dst_width) {
+static void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
+                                      uint8* dst_ptr, int dst_width) {
   __asm {
     push       esi
     mov        eax, [esp + 4 + 4]    // src_ptr
@@ -79,8 +79,7 @@ void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
     lea        eax,  [eax + 32]
     pavgb      xmm0, xmm2            // average rows
     pavgb      xmm1, xmm3
-
-    movdqa     xmm2, xmm0            // average columns (32 to 16 pixels)
+    movdqa     xmm2, xmm0            // average columns (8 to 4 pixels)
     shufps     xmm0, xmm1, 0x88      // even pixels
     shufps     xmm2, xmm1, 0xdd      // odd pixels
     pavgb      xmm0, xmm2
@@ -94,6 +93,94 @@ void ScaleARGBRowDown2Int_SSE2(const uint8* src_ptr, int src_stride,
   }
 }
 
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+// Reads 4 pixels at a time.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+                                      int src_stepx,
+                                      uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       ebx
+    push       edi
+    mov        eax, [esp + 8 + 4]    // src_ptr
+                                     // src_stride ignored
+    mov        ebx, [esp + 8 + 12]   // src_stepx
+    mov        edx, [esp + 8 + 16]   // dst_ptr
+    mov        ecx, [esp + 8 + 20]   // dst_width
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align 16
+  wloop:
+    movd       xmm0, [eax]
+    movd       xmm1, [eax + ebx]
+    punpckldq  xmm0, xmm1
+    movd       xmm2, [eax + ebx * 2]
+    movd       xmm3, [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    punpckldq  xmm2, xmm3
+    punpcklqdq xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        ebx
+    ret
+  }
+}
+
+// Blends four 2x2 blocks to 4x1.
+// Alignment requirement: dst_ptr 16 byte aligned.
+__declspec(naked) __declspec(align(16))
+static void ScaleARGBRowDownEvenInt_SSE2(const uint8* src_ptr, int src_stride,
+                                         int src_stepx,
+                                         uint8* dst_ptr, int dst_width) {
+  __asm {
+    push       ebx
+    push       esi
+    push       edi
+    mov        eax, [esp + 12 + 4]    // src_ptr
+    mov        esi, [esp + 12 + 8]    // src_stride
+    mov        ebx, [esp + 12 + 12]   // src_stepx
+    mov        edx, [esp + 12 + 16]   // dst_ptr
+    mov        ecx, [esp + 12 + 20]   // dst_width
+    lea        esi, [eax + esi]       // row1 pointer
+    lea        ebx, [ebx * 4]
+    lea        edi, [ebx + ebx * 2]
+
+    align 16
+  wloop:
+    movq       xmm0, qword ptr [eax]  // row0 4 pairs
+    movhps     xmm0, qword ptr [eax + ebx]
+    movq       xmm1, qword ptr [eax + ebx * 2]
+    movhps     xmm1, qword ptr [eax + edi]
+    lea        eax,  [eax + ebx * 4]
+    movq       xmm2, qword ptr [esi]  // row1 4 pairs
+    movhps     xmm2, qword ptr [esi + ebx]
+    movq       xmm3, qword ptr [esi + ebx * 2]
+    movhps     xmm3, qword ptr [esi + edi]
+    lea        esi,  [esi + ebx * 4]
+    pavgb      xmm0, xmm2             // average rows
+    pavgb      xmm1, xmm3
+    movdqa     xmm2, xmm0             // average columns (8 to 4 pixels)
+    shufps     xmm0, xmm1, 0x88       // even pixels
+    shufps     xmm2, xmm1, 0xdd       // odd pixels
+    pavgb      xmm0, xmm2
+    sub        ecx, 4
+    movdqa     [edx], xmm0
+    lea        edx, [edx + 16]
+    jg         wloop
+
+    pop        edi
+    pop        esi
+    pop        ebx
+    ret
+  }
+}
+
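About the addressing in ScaleARGBRowDownEven_SSE2: ebx holds src_stepx * 4
(the pixel step in bytes) and edi holds three times that, so [eax],
[eax + ebx], [eax + ebx * 2] and [eax + edi] address four source pixels
spaced src_stepx apart, and punpckldq/punpcklqdq pack the four 32-bit pixels
into a single aligned store. In scalar terms, one wloop iteration is
(illustrative only; the matching C reference, ScaleARGBRowDownEven_C, is
added further down in this patch):

// src and dst viewed as uint32 ARGB pixels:
dst[0] = src[0 * src_stepx];   // movd xmm0
dst[1] = src[1 * src_stepx];   // movd xmm1; punpckldq -> low qword
dst[2] = src[2 * src_stepx];   // movd xmm2
dst[3] = src[3 * src_stepx];   // movd xmm3; punpckldq + punpcklqdq
src += 4 * src_stepx;          // lea eax, [eax + ebx * 4]
dst += 4;                      // movdqa [edx]; lea edx, [edx + 16]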
 // Bilinear row filtering combines 4x2 -> 4x1. SSE2 version.
 #define HAS_SCALEARGBFILTERROWS_SSE2
 __declspec(naked) __declspec(align(16))
@@ -472,16 +559,16 @@ static void ScaleARGBRowDown2_C(const uint8* src_ptr, int,
   for (int x = 0; x < dst_width - 1; x += 2) {
     dst[0] = src[0];
     dst[1] = src[2];
-    dst += 2;
     src += 4;
+    dst += 2;
   }
   if (dst_width & 1) {
     dst[0] = src[0];
   }
 }
 
-void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
-                            uint8* dst_ptr, int dst_width) {
+static void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
+                                   uint8* dst_ptr, int dst_width) {
   for (int x = 0; x < dst_width; ++x) {
     dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
                   src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
@@ -491,8 +578,42 @@ void ScaleARGBRowDown2Int_C(const uint8* src_ptr, int src_stride,
                   src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
     dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
                   src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
-    dst_ptr += 4;
     src_ptr += 8;
+    dst_ptr += 4;
+  }
+}
+
+static void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+                                   int src_stepx,
+                                   uint8* dst_ptr, int dst_width) {
+  const uint32* src = reinterpret_cast<const uint32*>(src_ptr);
+  uint32* dst = reinterpret_cast<uint32*>(dst_ptr);
+
+  for (int x = 0; x < dst_width - 1; x += 2) {
+    dst[0] = src[0];
+    dst[1] = src[src_stepx];
+    src += src_stepx * 2;
+    dst += 2;
+  }
+  if (dst_width & 1) {
+    dst[0] = src[0];
+  }
+}
+
+static void ScaleARGBRowDownEvenInt_C(const uint8* src_ptr, int src_stride,
+                                      int src_stepx,
+                                      uint8* dst_ptr, int dst_width) {
+  for (int x = 0; x < dst_width; ++x) {
+    dst_ptr[0] = (src_ptr[0] + src_ptr[4] +
+                  src_ptr[src_stride] + src_ptr[src_stride + 4] + 2) >> 2;
+    dst_ptr[1] = (src_ptr[1] + src_ptr[5] +
+                  src_ptr[src_stride + 1] + src_ptr[src_stride + 5] + 2) >> 2;
+    dst_ptr[2] = (src_ptr[2] + src_ptr[6] +
+                  src_ptr[src_stride + 2] + src_ptr[src_stride + 6] + 2) >> 2;
+    dst_ptr[3] = (src_ptr[3] + src_ptr[7] +
+                  src_ptr[src_stride + 3] + src_ptr[src_stride + 7] + 2) >> 2;
+    src_ptr += src_stepx * 4;
+    dst_ptr += 4;
   }
 }
 
@@ -583,7 +704,7 @@ static void ScaleARGBDown2(int src_width, int src_height,
       filtering ? ScaleARGBRowDown2Int_C : ScaleARGBRowDown2_C;
 #if defined(HAS_SCALEARGBROWDOWN2_SSE2)
   if (TestCpuFlag(kCpuHasSSE2) &&
-      IS_ALIGNED(dst_width, 16) &&
+      IS_ALIGNED(dst_width, 4) &&
       IS_ALIGNED(src_ptr, 16) && IS_ALIGNED(src_stride, 16) &&
       IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
     ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Int_SSE2 :
@@ -599,15 +720,52 @@ static void ScaleARGBDown2(int src_width, int src_height,
   }
 }
 
+
+/**
+ * ScaleARGB ARGB Even
+ *
+ * This is an optimized version for scaling down ARGB by an even
+ * factor of its original size.
+ *
+ */
+static void ScaleARGBDownEven(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_ptr, uint8* dst_ptr,
+                              FilterMode filtering) {
+  assert(IS_ALIGNED(src_width, 2));
+  assert(IS_ALIGNED(src_height, 2));
+  void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+                               int src_step, uint8* dst_ptr, int dst_width) =
+      filtering ? ScaleARGBRowDownEvenInt_C : ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2) &&
+      IS_ALIGNED(dst_width, 4) &&
+      IS_ALIGNED(dst_ptr, 16) && IS_ALIGNED(dst_stride, 16)) {
+    ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenInt_SSE2 :
+        ScaleARGBRowDownEven_SSE2;
+  }
+#endif
+  int src_step = src_width / dst_width;
+  // Adjust to point to center of box.
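+  // For example, a 6x reduction gives src_step = 6 and row_step = 6; the
+  // offset below is then 2 rows and 2 pixels into each 6x6 cell, so the
+  // 2x2 blend (or the single point sample) lands on the center of the cell.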
+  int row_step = src_height / dst_height;
+  int row_stride = row_step * src_stride;
+  src_ptr += ((row_step >> 1) - 1) * src_stride + ((src_step >> 1) - 1) * 4;
+  for (int y = 0; y < dst_height; ++y) {
+    ScaleARGBRowDownEven(src_ptr, src_stride, src_step, dst_ptr, dst_width);
+    src_ptr += row_stride;
+    dst_ptr += dst_stride;
+  }
+}
 /**
  * ScaleARGB ARGB to/from any dimensions, with bilinear
  * interpolation.
  */
-void ScaleARGBBilinear(int src_width, int src_height,
-                       int dst_width, int dst_height,
-                       int src_stride, int dst_stride,
-                       const uint8* src_ptr, uint8* dst_ptr) {
+static void ScaleARGBBilinear(int src_width, int src_height,
+                              int dst_width, int dst_height,
+                              int src_stride, int dst_stride,
+                              const uint8* src_ptr, uint8* dst_ptr) {
   assert(dst_width > 0);
   assert(dst_height > 0);
   assert(src_width <= kMaxInputWidth);
@@ -728,11 +886,25 @@ static void ScaleARGB(const uint8* src, int src_stride,
     return;
   }
   if (2 * dst_width == src_width && 2 * dst_height == src_height) {
-    // optimized 1/2.
+    // Optimized 1/2.
     ScaleARGBDown2(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src, dst, filtering);
     return;
   }
+  int scale_down_x = src_width / dst_width;
+  int scale_down_y = src_height / dst_height;
+  if (dst_width * scale_down_x == src_width &&
+      dst_height * scale_down_y == src_height) {
+    if (!(scale_down_x & 1) && !(scale_down_y & 1)) {
+      // Optimized even scale down, i.e. 4x, 6x, 8x, 10x.
+      ScaleARGBDownEven(src_width, src_height, dst_width, dst_height,
+                        src_stride, dst_stride, src, dst, filtering);
+      return;
+    }
+    if ((scale_down_x & 1) && (scale_down_y & 1)) {
+      filtering = kFilterNone;
+    }
+  }
   // Arbitrary scale up and/or down.
   ScaleARGBAnySize(src_width, src_height, dst_width, dst_height,
                    src_stride, dst_stride, src, dst, filtering);
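Callers do not need to opt in to the new path: ScaleARGB picks
ScaleARGBDownEven whenever both reduction factors are integral and even,
forces point sampling when both are odd, and otherwise uses the
arbitrary-size path. A usage sketch, inside some caller function (buffer
sizes are illustrative; ARGBScale and kFilterBilinear come from
scale_argb.h above):

static uint8 src[1280 * 720 * 4];  // source ARGB frame
static uint8 dst[320 * 180 * 4];   // 4x reduction on both axes

// 1280 / 320 == 720 / 180 == 4, an even integral factor, so this call
// takes the ScaleARGBDownEven path, using the 2x2 center blend because
// bilinear filtering was requested.
ARGBScale(src, 1280 * 4, 1280, 720,
          dst, 320 * 4, 320, 180,
          kFilterBilinear);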
diff --git a/unit_test/scale_argb_test.cc b/unit_test/scale_argb_test.cc
index 489d91323..08103ca87 100644
--- a/unit_test/scale_argb_test.cc
+++ b/unit_test/scale_argb_test.cc
@@ -20,129 +20,169 @@ namespace libyuv {
 static int ARGBTestFilter(int src_width, int src_height,
                           int dst_width, int dst_height, FilterMode f) {
-  int b = 128;
-
-  int src_y_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
-  int src_stride_y = (2 * b + src_width) * 4;
-
-  align_buffer_16(src_y, src_y_plane_size)
-
-  int dst_y_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
-  int dst_stride_y = (2 * b + dst_width) * 4;
+  const int b = 128;
+  int src_argb_plane_size = (src_width + (2 * b)) * (src_height + (2 * b)) * 4;
+  int src_stride_argb = (2 * b + src_width) * 4;
+
+  align_buffer_16(src_argb, src_argb_plane_size)
+  memset(src_argb, 1, src_argb_plane_size);
+
+  int dst_argb_plane_size = (dst_width + (2 * b)) * (dst_height + (2 * b)) * 4;
+  int dst_stride_argb = (2 * b + dst_width) * 4;
 
   srandom(time(NULL));
   int i, j;
-
   for (i = b; i < (src_height + b); ++i) {
     for (j = b; j < (src_width + b) * 4; ++j) {
-      src_y[(i * src_stride_y) + j] = (random() & 0xff);
+      src_argb[(i * src_stride_argb) + j] = (random() & 0xff);
     }
   }
 
   const int runs = 1000;
-  align_buffer_16(dst_y_c, dst_y_plane_size)
-  align_buffer_16(dst_y_opt, dst_y_plane_size)
+  align_buffer_16(dst_argb_c, dst_argb_plane_size)
+  align_buffer_16(dst_argb_opt, dst_argb_plane_size)
+  memset(dst_argb_c, 2, dst_argb_plane_size);
+  memset(dst_argb_opt, 3, dst_argb_plane_size);
 
-  MaskCpuFlags(kCpuInitialized);
+  // Warm up both versions for consistent benchmarks.
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
+            src_width, src_height,
+            dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
+            dst_width, dst_height, f);
+
+  MaskCpuFlags(0);  // Disable all CPU optimization.
   double c_time = get_time();
-
-  for (i = 0; i < runs; ++i)
-    ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
+  for (i = 0; i < runs; ++i) {
+    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
               src_width, src_height,
-              dst_y_c + (dst_stride_y * b) + b * 4, dst_stride_y,
+              dst_argb_c + (dst_stride_argb * b) + b * 4, dst_stride_argb,
               dst_width, dst_height, f);
-
+  }
   c_time = (get_time() - c_time) / runs;
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
   double opt_time = get_time();
-
-  for (i = 0; i < runs; ++i)
-    ARGBScale(src_y + (src_stride_y * b) + b * 4, src_stride_y,
+  for (i = 0; i < runs; ++i) {
+    ARGBScale(src_argb + (src_stride_argb * b) + b * 4, src_stride_argb,
               src_width, src_height,
-              dst_y_opt + (dst_stride_y * b) + b * 4, dst_stride_y,
+              dst_argb_opt + (dst_stride_argb * b) + b * 4, dst_stride_argb,
               dst_width, dst_height, f);
-
+  }
   opt_time = (get_time() - opt_time) / runs;
 
-  printf ("filter %d - %8d us c - %8d us opt\n",
-          f, (int)(c_time*1e6), (int)(opt_time*1e6));
+  // Report performance of C vs OPT.
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   // operations may introduce rounding somewhere. So do a difference
   // of the buffers and look to see that the max difference isn't
   // over 2.
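+  // For example, averaging the 2x2 block {1, 1, 1, 2}: the C code computes
+  // (1 + 1 + 1 + 2 + 2) >> 2 = 1, while pavgb rounds up at each of its two
+  // pairwise passes and yields 2.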
-  int err = 0;
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b * 4; j < (dst_width + b) * 4; ++j) {
-      int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
-                         dst_y_opt[(i * dst_stride_y) + j]);
+      int abs_diff = abs(dst_argb_c[(i * dst_stride_argb) + j] -
+                         dst_argb_opt[(i * dst_stride_argb) + j]);
       if (abs_diff > max_diff)
         max_diff = abs_diff;
     }
   }
 
-  if (max_diff > 2)
-    err++;
-
-  free_aligned_buffer_16(dst_y_c)
-  free_aligned_buffer_16(dst_y_opt)
-  free_aligned_buffer_16(src_y)
-  return err;
+  free_aligned_buffer_16(dst_argb_c)
+  free_aligned_buffer_16(dst_argb_opt)
+  free_aligned_buffer_16(src_argb)
+  return max_diff;
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy2) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 2;
   const int dst_height = src_height / 2;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy4) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 4;
   const int dst_height = src_height / 4;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
+}
 
-  EXPECT_EQ(0, err);
+TEST_F(libyuvTest, ARGBScaleDownBy5) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 5;
+  const int dst_height = src_height / 5;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy8) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 8;
+  const int dst_height = src_height / 8;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ARGBScaleDownBy16) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 16;
+  const int dst_height = src_height / 16;
+
+  for (int f = 0; f < 2; ++f) {
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ARGBScaleDownBy34) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width * 3 / 4;
   const int dst_height = src_height * 3 / 4;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
@@ -150,31 +190,27 @@ TEST_F(libyuvTest, ARGBScaleDownBy38) {
   int src_height = 720;
   int dst_width = src_width * 3 / 8;
   int dst_height = src_height * 3 / 8;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
-TEST_F(libyuvTest, ARGBScalePlaneBilinear) {
+TEST_F(libyuvTest, ARGBScaleTo1366) {
   int src_width = 1280;
   int src_height = 720;
   int dst_width = 1366;
   int dst_height = 768;
-  int err = 0;
 
   for (int f = 0; f < 2; ++f) {
-    err += ARGBTestFilter(src_width, src_height,
-                          dst_width, dst_height,
-                          static_cast<FilterMode>(f));
+    int err = ARGBTestFilter(src_width, src_height,
+                             dst_width, dst_height,
+                             static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
   }
-
-  EXPECT_EQ(0, err);
 }
 
 }  // namespace libyuv
diff --git a/unit_test/scale_test.cc b/unit_test/scale_test.cc
index 215aadfb6..4701c1e1a 100644
--- a/unit_test/scale_test.cc
+++ b/unit_test/scale_test.cc
@@ -20,8 +20,7 @@ namespace libyuv {
 static int TestFilter(int src_width, int src_height,
                       int dst_width, int dst_height, FilterMode f) {
-
-  int b = 128;
+  const int b = 128;
 
   int src_width_uv = (src_width + 1) >> 1;
   int src_height_uv = (src_height + 1) >> 1;
@@ -47,7 +46,6 @@ static int TestFilter(int src_width, int src_height,
   srandom(time(NULL));
   int i, j;
-
   for (i = b; i < (src_height + b); ++i) {
     for (j = b; j < (src_width + b); ++j) {
       src_y[(i * src_stride_y) + j] = (random() & 0xff);
@@ -69,10 +67,29 @@ static int TestFilter(int src_width, int src_height,
   align_buffer_16(dst_u_opt, dst_uv_plane_size)
   align_buffer_16(dst_v_opt, dst_uv_plane_size)
 
-  MaskCpuFlags(kCpuInitialized);
-  double c_time = get_time();
+  // Warm up both versions for consistent benchmarks.
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_c + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
+  I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
+            src_u + (src_stride_uv * b) + b, src_stride_uv,
+            src_v + (src_stride_uv * b) + b, src_stride_uv,
+            src_width, src_height,
+            dst_y_opt + (dst_stride_y * b) + b, dst_stride_y,
+            dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
+            dst_width, dst_height, f);
 
-  for (i = 0; i < runs; ++i)
+  MaskCpuFlags(0);  // Disable all CPU optimization.
+  double c_time = get_time();
+  for (i = 0; i < runs; ++i) {
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
@@ -81,13 +98,12 @@ static int TestFilter(int src_width, int src_height,
               dst_u_c + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_c + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_width, dst_height, f);
-
+  }
   c_time = (get_time() - c_time) / runs;
 
-  MaskCpuFlags(-1);
+  MaskCpuFlags(-1);  // Enable all CPU optimization.
   double opt_time = get_time();
-
-  for (i = 0; i < runs; ++i)
+  for (i = 0; i < runs; ++i) {
     I420Scale(src_y + (src_stride_y * b) + b, src_stride_y,
               src_u + (src_stride_uv * b) + b, src_stride_uv,
               src_v + (src_stride_uv * b) + b, src_stride_uv,
@@ -96,24 +112,25 @@ static int TestFilter(int src_width, int src_height,
               dst_u_opt + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_v_opt + (dst_stride_uv * b) + b, dst_stride_uv,
               dst_width, dst_height, f);
-
+  }
   opt_time = (get_time() - opt_time) / runs;
 
-  printf ("filter %d - %8d us c - %8d us opt\n",
-          f, (int)(c_time*1e6), (int)(opt_time*1e6));
+  // Report performance of C vs OPT.
+  printf("filter %d - %8d us C - %8d us OPT\n",
+         f, static_cast<int>(c_time * 1e6), static_cast<int>(opt_time * 1e6));
 
   // C version may be a little off from the optimized. Order of
   // operations may introduce rounding somewhere. So do a difference
   // of the buffers and look to see that the max difference isn't
   // over 2.
-  int err = 0;
   int max_diff = 0;
   for (i = b; i < (dst_height + b); ++i) {
     for (j = b; j < (dst_width + b); ++j) {
       int abs_diff = abs(dst_y_c[(i * dst_stride_y) + j] -
                          dst_y_opt[(i * dst_stride_y) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
+      }
     }
   }
 
@@ -121,19 +138,17 @@ static int TestFilter(int src_width, int src_height,
     for (j = b; j < (dst_width_uv + b); ++j) {
       int abs_diff = abs(dst_u_c[(i * dst_stride_uv) + j] -
                          dst_u_opt[(i * dst_stride_uv) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
+      }
       abs_diff = abs(dst_v_c[(i * dst_stride_uv) + j] -
                      dst_v_opt[(i * dst_stride_uv) + j]);
-      if (abs_diff > max_diff)
+      if (abs_diff > max_diff) {
         max_diff = abs_diff;
-
+      }
     }
   }
 
-  if (max_diff > 2)
-    err++;
-
   free_aligned_buffer_16(dst_y_c)
   free_aligned_buffer_16(dst_u_c)
   free_aligned_buffer_16(dst_v_c)
@@ -145,55 +160,91 @@ static int TestFilter(int src_width, int src_height,
   free_aligned_buffer_16(src_u)
   free_aligned_buffer_16(src_v)
 
-  return err;
+  return max_diff;
 }
 
 TEST_F(libyuvTest, ScaleDownBy2) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 2;
   const int dst_height = src_height / 2;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ScaleDownBy4) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width / 4;
   const int dst_height = src_height / 4;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(2, err);  // This is the only scale factor with error of 2.
+  }
+}
 
-  EXPECT_EQ(0, err);
+TEST_F(libyuvTest, ScaleDownBy5) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 5;
+  const int dst_height = src_height / 5;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ScaleDownBy8) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 8;
+  const int dst_height = src_height / 8;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
+}
+
+TEST_F(libyuvTest, ScaleDownBy16) {
+  const int src_width = 1280;
+  const int src_height = 720;
+  const int dst_width = src_width / 16;
+  const int dst_height = src_height / 16;
+
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 TEST_F(libyuvTest, ScaleDownBy34) {
-
   const int src_width = 1280;
   const int src_height = 720;
   const int dst_width = src_width * 3 / 4;
   const int dst_height = src_height * 3 / 4;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
@@ -201,29 +252,27 @@ TEST_F(libyuvTest, ScaleDownBy38) {
   int src_height = 720;
   int dst_width = src_width * 3 / 8;
   int dst_height = src_height * 3 / 8;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
-TEST_F(libyuvTest, ScalePlaneBilinear) {
+TEST_F(libyuvTest, ScaleTo1366) {
   int src_width = 1280;
   int src_height = 720;
   int dst_width = 1366;
   int dst_height = 768;
-  int err = 0;
 
-  for (int f = 0; f < 3; ++f)
-    err += TestFilter(src_width, src_height,
-                      dst_width, dst_height,
-                      static_cast<FilterMode>(f));
-
-  EXPECT_EQ(0, err);
+  for (int f = 0; f < 3; ++f) {
+    int err = TestFilter(src_width, src_height,
+                         dst_width, dst_height,
+                         static_cast<FilterMode>(f));
+    EXPECT_GE(1, err);
+  }
 }
 
 }  // namespace libyuv
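A note on the assertion style used throughout: gtest's EXPECT_GE(a, b)
checks a >= b, so EXPECT_GE(1, err) bounds the maximum per-channel
difference between the C and SIMD outputs at 1 (2 for the 1/4 Y-plane
case). The pattern generalizes to any libyuv entry point; a condensed
sketch of the harness shape, inside a TEST_F body (sizes are illustrative,
and the border/timing machinery of the real tests is omitted):

  uint8 src[64 * 64 * 4];
  uint8 dst_c[32 * 32 * 4];
  uint8 dst_opt[32 * 32 * 4];
  memset(src, 1, sizeof(src));

  MaskCpuFlags(0);   // force the portable C path
  ARGBScale(src, 64 * 4, 64, 64, dst_c, 32 * 4, 32, 32, kFilterBilinear);
  MaskCpuFlags(-1);  // restore all SIMD paths
  ARGBScale(src, 64 * 4, 64, 64, dst_opt, 32 * 4, 32, 32, kFilterBilinear);

  int max_diff = 0;  // bound the rounding disagreement between the paths
  for (int i = 0; i < 32 * 32 * 4; ++i) {
    int diff = abs(dst_c[i] - dst_opt[i]);
    if (diff > max_diff) max_diff = diff;
  }
  EXPECT_GE(1, max_diff);  // asserts 1 >= max_diff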