diff --git a/source/row_common.cc b/source/row_common.cc
index abf9cf72d..bf953eeff 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -202,9 +202,8 @@ void ARGBToRGB565Row_C(const uint8* src_argb, uint8* dst_rgb, int width) {
     uint8 b1 = src_argb[4] >> 3;
     uint8 g1 = src_argb[5] >> 2;
     uint8 r1 = src_argb[6] >> 3;
-    WRITEWORD(
-        dst_rgb,
-        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
@@ -238,9 +237,8 @@ void ARGBToRGB565DitherRow_C(const uint8* src_argb,
     uint8 b1 = clamp255(src_argb[4] + dither1) >> 3;
     uint8 g1 = clamp255(src_argb[5] + dither1) >> 2;
     uint8 r1 = clamp255(src_argb[6] + dither1) >> 3;
-    WRITEWORD(
-        dst_rgb,
-        b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) | (r1 << 27));
+    WRITEWORD(dst_rgb, b0 | (g0 << 5) | (r0 << 11) | (b1 << 16) | (g1 << 21) |
+                           (r1 << 27));
     dst_rgb += 4;
     src_argb += 8;
   }
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 4a9d88916..8735070b6 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -1,4 +1,3 @@
-// VERSION 2
 /*
  *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
  *
@@ -5457,12 +5456,13 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "pshufd     $0x0,%3,%%xmm4                 \n"
     "pxor       %%xmm5,%%xmm5                  \n"
+    "sub        %0,%1                          \n"
 
     // 16 pixel loop.
     LABELALIGN
   "1:                                          \n"
     "movdqu     " MEMACCESS(0) ",%%xmm2        \n"  // 8 shorts
-    "lea        " MEMLEA(0x10,0) ",%0          \n"
+    "add        $0x10,%0                       \n"
     "movdqa     %%xmm2,%%xmm3                  \n"
     "punpcklwd  %%xmm5,%%xmm2                  \n"  // 8 ints in xmm2/1
    "cvtdq2ps   %%xmm2,%%xmm2                   \n"  // 8 floats
@@ -5473,8 +5473,7 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
     "psrld      $0xd,%%xmm2                    \n"
     "psrld      $0xd,%%xmm3                    \n"
     "packssdw   %%xmm3,%%xmm2                  \n"
-    "movdqu     %%xmm2," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x10,1) ",%1          \n"
+    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
     "sub        $0x8,%2                        \n"
     "jg         1b                             \n"
   : "+r"(src),   // %0
@@ -5488,17 +5487,17 @@ void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
 #endif  // HAS_HALFFLOATROW_SSE2
 
 #ifdef HAS_HALFFLOATROW_AVX2
-// TODO(fbarchard): consider vadddw instead of vmulps
 void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
   asm volatile (
     "vbroadcastss %3, %%ymm4                   \n"
     "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
+    "sub        %0,%1                          \n"
 
     // 16 pixel loop.
     LABELALIGN
   "1:                                          \n"
     "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
-    "lea        " MEMLEA(0x20,0) ",%0          \n"
+    "add        $0x20,%0                       \n"
     "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
     "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
     "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
@@ -5508,10 +5507,10 @@ void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
     "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
     "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
     "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
-    "vmovdqu    %%ymm2," MEMACCESS(1) "        \n"
-    "lea        " MEMLEA(0x20,1) ",%1          \n"
+    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
     "sub        $0x10,%2                       \n"
     "jg         1b                             \n"
+    "vzeroupper                                \n"
 
   : "+r"(src),   // %0
     "+r"(dst),   // %1
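A note on the SSE2/AVX2 paths above: they convert to half floats without F16C by folding the half-float exponent rebias into the multiply. Scaling by 2^-112 moves a value from the float exponent range (bias 127) into the half range (bias 15), after which the upper bits of the float already hold the half-float bit pattern, 13 bits up; psrld $0xd plus packssdw extracts it. A scalar C sketch of what each lane computes (hypothetical helper, not libyuv code; assumes non-negative, in-range inputs as here):

#include <stdint.h>
#include <string.h>

// Scalar model of one lane of HalfFloatRow_SSE2/AVX2 (illustrative only).
static uint16_t HalfFromUint16(uint16_t v, float scale) {
  const float kExpBias = 1.9259299444e-34f;  // 2^-112: rebias exponent 127 -> 15
  float f = (float)v * (scale * kExpBias);   // cvtdq2ps + mulps
  uint32_t bits;
  memcpy(&bits, &f, sizeof(bits));           // reinterpret the float bits
  return (uint16_t)(bits >> 13);             // psrld $0xd; packssdw keeps 16 bits
}

For example, v = 1 with scale = 1.0f yields bits 0x3C00, the half-float encoding of 1.0.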
LABELALIGN "1: \n" "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n" "vcvtdq2ps %%ymm3,%%ymm3 \n" "vmulps %%ymm2,%%ymm4,%%ymm2 \n" "vmulps %%ymm3,%%ymm4,%%ymm3 \n" "vcvtps2ph $3, %%ymm2, %%xmm2 \n" "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2," MEMACCESS(1) " \n" - "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) + MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) + "add $0x20,%0 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 @@ -5560,22 +5558,21 @@ void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) { #ifdef HAS_HALFFLOATROW_F16C void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) { asm volatile ( + "sub %0,%1 \n" // 16 pixel loop. LABELALIGN "1: \n" "vpmovzxwd " MEMACCESS(0) ",%%ymm2 \n" // 16 shorts -> 16 ints "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm3 \n" - "lea " MEMLEA(0x20,0) ",%0 \n" "vcvtdq2ps %%ymm2,%%ymm2 \n" "vcvtdq2ps %%ymm3,%%ymm3 \n" "vcvtps2ph $3, %%ymm2, %%xmm2 \n" "vcvtps2ph $3, %%ymm3, %%xmm3 \n" - "vmovdqu %%xmm2," MEMACCESS(1) " \n" - "vmovdqu %%xmm3," MEMACCESS2(0x10,1) " \n" - "lea " MEMLEA(0x20,1) ",%1 \n" + MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1) + MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1) + "add $0x20,%0 \n" "sub $0x10,%2 \n" "jg 1b \n" - "vzeroupper \n" : "+r"(src), // %0 "+r"(dst), // %1 diff --git a/source/row_win.cc b/source/row_win.cc index 028f7bf92..202f2b8d4 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -6070,11 +6070,12 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, mulss xmm4, kExpBias pshufd xmm4, xmm4, 0 pxor xmm5, xmm5 + sub edx, eax // 8 pixel loop. convertloop: movdqu xmm2, xmmword ptr [eax] // 8 shorts - lea eax, [eax + 16] + add eax, 16 movdqa xmm3, xmm2 punpcklwd xmm2, xmm5 cvtdq2ps xmm2, xmm2 // convert 8 ints to floats @@ -6085,8 +6086,7 @@ __declspec(naked) void HalfFloatRow_SSE2(const uint16* src, psrld xmm2, 13 psrld xmm3, 13 packssdw xmm2, xmm3 - movdqu [edx], xmm2 - lea edx, [edx + 16] + movdqu [eax + edx - 16], xmm2 sub ecx, 8 jg convertloop ret @@ -6108,11 +6108,12 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, vmulss xmm4, xmm4, kExpBias vbroadcastss ymm4, xmm4 vpxor ymm5, ymm5, ymm5 + sub edx, eax // 16 pixel loop. convertloop: vmovdqu ymm2, [eax] // 16 shorts - lea eax, [eax + 32] + add eax, 32 vpunpckhwd ymm3, ymm2, ymm5 // convert 16 shorts to 16 ints vpunpcklwd ymm2, ymm2, ymm5 vcvtdq2ps ymm3, ymm3 // convert 16 ints to floats @@ -6122,8 +6123,7 @@ __declspec(naked) void HalfFloatRow_AVX2(const uint16* src, vpsrld ymm3, ymm3, 13 // float convert to 8 half floats truncate vpsrld ymm2, ymm2, 13 vpackssdw ymm2, ymm2, ymm3 - vmovdqu [edx], ymm2 - lea edx, [edx + 32] + vmovdqu [eax + edx - 32], ymm2 sub ecx, 16 jg convertloop vzeroupper @@ -6142,21 +6142,21 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src, mov edx, [esp + 8] /* dst */ vbroadcastss ymm4, [esp + 12] /* scale */ mov ecx, [esp + 16] /* width */ + sub edx, eax // 16 pixel loop. 
@@ -6142,21 +6142,21 @@ __declspec(naked) void HalfFloatRow_F16C(const uint16* src,
     mov        edx, [esp + 8]      /* dst */
     vbroadcastss ymm4, [esp + 12]  /* scale */
     mov        ecx, [esp + 16]     /* width */
+    sub        edx, eax
 
     // 16 pixel loop.
 convertloop:
     vpmovzxwd  ymm2, xmmword ptr [eax]  // 8 shorts -> 8 ints
     vpmovzxwd  ymm3, xmmword ptr [eax + 16]  // 8 more shorts
-    lea        eax, [eax + 32]
+    add        eax, 32
     vcvtdq2ps  ymm2, ymm2  // convert 8 ints to floats
     vcvtdq2ps  ymm3, ymm3
     vmulps     ymm2, ymm2, ymm4  // scale to normalized range 0 to 1
     vmulps     ymm3, ymm3, ymm4
     vcvtps2ph  xmm2, ymm2, 3  // float convert to 8 half floats truncate
     vcvtps2ph  xmm3, ymm3, 3
-    vmovdqu    [edx], xmm2
-    vmovdqu    [edx + 16], xmm3
-    lea        edx, [edx + 32]
+    vmovdqu    [eax + edx - 32], xmm2
+    vmovdqu    [eax + edx - 32 + 16], xmm3
     sub        ecx, 16
     jg         convertloop
     vzeroupper
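The F16C variant drops the bias/shift trick and uses the hardware float-to-half instruction, vcvtps2ph with immediate 3 (truncate). A rough intrinsics equivalent of one loop iteration, 16 pixels at a time (a sketch assuming AVX2 + F16C; not the shipped code):

#include <immintrin.h>
#include <stdint.h>

// 16 uint16 values -> 16 half floats, mirroring the F16C loop body.
static void HalfFloat16(const uint16_t* src, uint16_t* dst, __m256 scale) {
  __m256i i0 = _mm256_cvtepu16_epi32(            // vpmovzxwd
      _mm_loadu_si128((const __m128i*)src));
  __m256i i1 = _mm256_cvtepu16_epi32(
      _mm_loadu_si128((const __m128i*)(src + 8)));
  __m256 f0 = _mm256_mul_ps(_mm256_cvtepi32_ps(i0), scale);  // vcvtdq2ps+vmulps
  __m256 f1 = _mm256_mul_ps(_mm256_cvtepi32_ps(i1), scale);
  _mm_storeu_si128((__m128i*)dst, _mm256_cvtps_ph(f0, 3));       // vcvtps2ph
  _mm_storeu_si128((__m128i*)(dst + 8), _mm256_cvtps_ph(f1, 3));
}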
diff --git a/source/scale.cc b/source/scale.cc
index 65df1f09e..0c94a3c98 100644
--- a/source/scale.cc
+++ b/source/scale.cc
@@ -45,9 +45,10 @@ static void ScalePlaneDown2(int src_width,
   int y;
   void (*ScaleRowDown2)(const uint8* src_ptr, ptrdiff_t src_stride,
                         uint8* dst_ptr, int dst_width) =
-      filtering == kFilterNone ? ScaleRowDown2_C : (filtering == kFilterLinear
-                                                        ? ScaleRowDown2Linear_C
-                                                        : ScaleRowDown2Box_C);
+      filtering == kFilterNone
+          ? ScaleRowDown2_C
+          : (filtering == kFilterLinear ? ScaleRowDown2Linear_C
+                                        : ScaleRowDown2Box_C);
   int row_stride = src_stride << 1;
   if (!filtering) {
     src_ptr += src_stride;  // Point to odd rows.
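The three row kernels selected by that conditional differ only in how each pair of source columns becomes one output pixel: kFilterNone takes the even pixel, kFilterLinear averages the two pixels of one row, and the box filter averages the full 2x2 block across two rows. A scalar sketch of the box case (a simplified rendering of what ScaleRowDown2Box_C computes; even widths assumed):

#include <stddef.h>
#include <stdint.h>

// 2:1 downscale of one output row with a 2x2 box filter.
static void ScaleRowDown2Box(const uint8_t* src_ptr, ptrdiff_t src_stride,
                             uint8_t* dst_ptr, int dst_width) {
  const uint8_t* s = src_ptr;               // even source row
  const uint8_t* t = src_ptr + src_stride;  // odd source row
  for (int x = 0; x < dst_width; ++x) {
    // Rounded average of 4 pixels: (+2) >> 2.
    dst_ptr[x] = (uint8_t)((s[0] + s[1] + t[0] + t[1] + 2) >> 2);
    s += 2;
    t += 2;
  }
}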
diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc
index 3e55789a8..0f1c74302 100644
--- a/unit_test/convert_test.cc
+++ b/unit_test/convert_test.cc
@@ -30,113 +30,107 @@ namespace libyuv {
 
 #define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))
 
-#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
-                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = benchmark_height_; \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
-    align_buffer_page_end( \
-        src_u, \
-        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-            OFF); \
-    align_buffer_page_end( \
-        src_v, \
-        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-            OFF); \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    for (int i = 0; i < kHeight; ++i) \
-      for (int j = 0; j < kWidth; ++j) \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
-      for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
-        src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
-            (fastrand() & 0xff); \
-        src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
-            (fastrand() & 0xff); \
-      } \
-    } \
-    memset(dst_y_c, 1, kWidth* kHeight); \
-    memset(dst_u_c, 2, \
-           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    memset(dst_v_c, 3, \
-           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    memset(dst_y_opt, 101, kWidth* kHeight); \
-    memset(dst_u_opt, 102, \
-           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    memset(dst_v_opt, 103, \
-           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    SRC_FMT_PLANAR##To##FMT_PLANAR( \
-        src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
-        src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
-        dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      SRC_FMT_PLANAR##To##FMT_PLANAR( \
-          src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
-          src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
-          dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
-          SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kHeight; ++i) { \
-      for (int j = 0; j < kWidth; ++j) { \
-        int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
-                           static_cast<int>(dst_y_opt[i * kWidth + j])); \
-        if (abs_diff > max_diff) { \
-          max_diff = abs_diff; \
-        } \
-      } \
-    } \
-    EXPECT_EQ(0, max_diff); \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
-      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
-        int abs_diff = abs( \
-            static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
-            static_cast<int>( \
-                dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
-        if (abs_diff > max_diff) { \
-          max_diff = abs_diff; \
-        } \
-      } \
-    } \
-    EXPECT_LE(max_diff, 3); \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
-      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
-        int abs_diff = abs( \
-            static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
-            static_cast<int>( \
-                dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
-        if (abs_diff > max_diff) { \
-          max_diff = abs_diff; \
-        } \
-      } \
-    } \
-    EXPECT_LE(max_diff, 3); \
-    free_aligned_buffer_page_end(dst_y_c); \
-    free_aligned_buffer_page_end(dst_u_c); \
-    free_aligned_buffer_page_end(dst_v_c); \
-    free_aligned_buffer_page_end(dst_y_opt); \
-    free_aligned_buffer_page_end(dst_u_opt); \
-    free_aligned_buffer_page_end(dst_v_opt); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
+#define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                       FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = benchmark_height_; \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+    align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                                     OFF); \
+    align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                                     OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    for (int i = 0; i < kHeight; ++i) \
+      for (int j = 0; j < kWidth; ++j) \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+      for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+        src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+            (fastrand() & 0xff); \
+        src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+            (fastrand() & 0xff); \
+      } \
+    } \
+    memset(dst_y_c, 1, kWidth* kHeight); \
+    memset(dst_u_c, 2, \
+           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    memset(dst_v_c, 3, \
+           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    memset(dst_y_opt, 101, kWidth* kHeight); \
+    memset(dst_u_opt, 102, \
+           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    memset(dst_v_opt, 103, \
+           SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    SRC_FMT_PLANAR##To##FMT_PLANAR( \
+        src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+        src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+        dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_c, \
+        SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      SRC_FMT_PLANAR##To##FMT_PLANAR( \
+          src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+          src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+          dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X), dst_v_opt, \
+          SUBSAMPLE(kWidth, SUBSAMP_X), kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kHeight; ++i) { \
+      for (int j = 0; j < kWidth; ++j) { \
+        int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+                           static_cast<int>(dst_y_opt[i * kWidth + j])); \
+        if (abs_diff > max_diff) { \
+          max_diff = abs_diff; \
+        } \
+      } \
+    } \
+    EXPECT_EQ(0, max_diff); \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+        int abs_diff = abs( \
+            static_cast<int>(dst_u_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+            static_cast<int>( \
+                dst_u_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+        if (abs_diff > max_diff) { \
+          max_diff = abs_diff; \
+        } \
+      } \
+    } \
+    EXPECT_LE(max_diff, 3); \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+      for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
+        int abs_diff = abs( \
+            static_cast<int>(dst_v_c[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j]) - \
+            static_cast<int>( \
+                dst_v_opt[i * SUBSAMPLE(kWidth, SUBSAMP_X) + j])); \
+        if (abs_diff > max_diff) { \
+          max_diff = abs_diff; \
+        } \
+      } \
+    } \
+    EXPECT_LE(max_diff, 3); \
+    free_aligned_buffer_page_end(dst_y_c); \
+    free_aligned_buffer_page_end(dst_u_c); \
+    free_aligned_buffer_page_end(dst_v_c); \
+    free_aligned_buffer_page_end(dst_y_opt); \
+    free_aligned_buffer_page_end(dst_u_opt); \
+    free_aligned_buffer_page_end(dst_v_opt); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
   }
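The macro body above is the standard libyuv correctness harness: run the conversion once with the SIMD CPU flags masked off to produce a C reference, re-enable the benchmarked flags, run the optimized path, and diff the two outputs (Y must match exactly, subsampled chroma within a small tolerance). Distilled to one concrete conversion (a sketch; I420ToNV12 and MaskCpuFlags are real libyuv entry points, but the header choice and the flag parameters standing in for the fixture's disable_cpu_flags_/benchmark_cpu_info_ members are assumptions; even dimensions assumed):

#include <stdint.h>
#include "libyuv/convert_from.h"  // I420ToNV12 (assumed header)
#include "libyuv/cpu_id.h"        // MaskCpuFlags

// Compare the C path against the optimized path for one conversion.
static void CheckI420ToNV12(const uint8_t* src_y, const uint8_t* src_u,
                            const uint8_t* src_v, uint8_t* y_c, uint8_t* uv_c,
                            uint8_t* y_opt, uint8_t* uv_opt, int w, int h,
                            int disable_flags, int bench_flags) {
  libyuv::MaskCpuFlags(disable_flags);  // force the C row functions
  libyuv::I420ToNV12(src_y, w, src_u, w / 2, src_v, w / 2,
                     y_c, w, uv_c, w, w, h);
  libyuv::MaskCpuFlags(bench_flags);    // allow SIMD row functions again
  libyuv::I420ToNV12(src_y, w, src_u, w / 2, src_v, w / 2,
                     y_opt, w, uv_opt, w, w, h);
  // The test then EXPECTs y_c == y_opt byte for byte and |uv_c - uv_opt| <= 1.
}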
 
 #define TESTPLANARTOP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
@@ -172,19 +166,15 @@ TESTPLANARTOP(I444, 1, 1, I444, 1, 1)
     align_buffer_page_end(src_uv, \
                           kSizeUV*((PIXEL_STRIDE == 3) ? 3 : 2) + OFF); \
     align_buffer_page_end(dst_y_c, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
     align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
     uint8* src_u = src_uv + OFF_U; \
     uint8* src_v = src_uv + (PIXEL_STRIDE == 1 ? kSizeUV : OFF_V); \
     int src_stride_uv = SUBSAMPLE(kWidth, SUBSAMP_X) * PIXEL_STRIDE; \
@@ -288,88 +278,84 @@ TESTAPLANARTOP(Android420, I420, 1, 0, 0, 2, 2, I420, 2, 2)
 TESTAPLANARTOP(Android420, NV12, 2, 0, 1, 2, 2, I420, 2, 2)
 TESTAPLANARTOP(Android420, NV21, 2, 1, 0, 2, 2, I420, 2, 2)
 
-#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
-                        FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
-  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
-    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
-    const int kHeight = benchmark_height_; \
-    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
-    align_buffer_page_end( \
-        src_u, \
-        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-            OFF); \
-    align_buffer_page_end( \
-        src_v, \
-        SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-            OFF); \
-    align_buffer_page_end(dst_y_c, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_uv_c, \
-        SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_uv_opt, \
-        SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    for (int i = 0; i < kHeight; ++i) \
-      for (int j = 0; j < kWidth; ++j) \
-        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
-      for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
-        src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
-            (fastrand() & 0xff); \
-        src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
-            (fastrand() & 0xff); \
-      } \
-    } \
-    memset(dst_y_c, 1, kWidth* kHeight); \
-    memset(dst_uv_c, 2, \
-           SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    memset(dst_y_opt, 101, kWidth* kHeight); \
-    memset(dst_uv_opt, 102, \
-           SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    MaskCpuFlags(disable_cpu_flags_); \
-    SRC_FMT_PLANAR##To##FMT_PLANAR( \
-        src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
-        src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
-        dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
-    MaskCpuFlags(benchmark_cpu_info_); \
-    for (int i = 0; i < benchmark_iterations_; ++i) { \
-      SRC_FMT_PLANAR##To##FMT_PLANAR( \
-          src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
-          src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
-          dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
-    } \
-    int max_diff = 0; \
-    for (int i = 0; i < kHeight; ++i) { \
-      for (int j = 0; j < kWidth; ++j) { \
-        int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
-                           static_cast<int>(dst_y_opt[i * kWidth + j])); \
-        if (abs_diff > max_diff) { \
-          max_diff = abs_diff; \
-        } \
-      } \
-    } \
-    EXPECT_LE(max_diff, 1); \
-    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
-      for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
-        int abs_diff = \
-            abs(static_cast<int>( \
-                    dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
-                static_cast<int>( \
-                    dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
-        if (abs_diff > max_diff) { \
-          max_diff = abs_diff; \
-        } \
-      } \
-    } \
-    EXPECT_LE(max_diff, 1); \
-    free_aligned_buffer_page_end(dst_y_c); \
-    free_aligned_buffer_page_end(dst_uv_c); \
-    free_aligned_buffer_page_end(dst_y_opt); \
-    free_aligned_buffer_page_end(dst_uv_opt); \
-    free_aligned_buffer_page_end(src_y); \
-    free_aligned_buffer_page_end(src_u); \
-    free_aligned_buffer_page_end(src_v); \
+#define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
+                        FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \
+  TEST_F(LibYUVConvertTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \
+    const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
+    const int kHeight = benchmark_height_; \
+    align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
+    align_buffer_page_end(src_u, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                                     OFF); \
+    align_buffer_page_end(src_v, SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                                     OFF); \
+    align_buffer_page_end(dst_y_c, kWidth* kHeight); \
+    align_buffer_page_end(dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+                                        SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
+    align_buffer_page_end(dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X) * \
+                                          SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    for (int i = 0; i < kHeight; ++i) \
+      for (int j = 0; j < kWidth; ++j) \
+        src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SRC_SUBSAMP_Y); ++i) { \
+      for (int j = 0; j < SUBSAMPLE(kWidth, SRC_SUBSAMP_X); ++j) { \
+        src_u[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+            (fastrand() & 0xff); \
+        src_v[(i * SUBSAMPLE(kWidth, SRC_SUBSAMP_X)) + j + OFF] = \
+            (fastrand() & 0xff); \
+      } \
+    } \
+    memset(dst_y_c, 1, kWidth* kHeight); \
+    memset(dst_uv_c, 2, \
+           SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    memset(dst_y_opt, 101, kWidth* kHeight); \
+    memset(dst_uv_opt, 102, \
+           SUBSAMPLE(kWidth * 2, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    MaskCpuFlags(disable_cpu_flags_); \
+    SRC_FMT_PLANAR##To##FMT_PLANAR( \
+        src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+        src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_c, kWidth, \
+        dst_uv_c, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+    MaskCpuFlags(benchmark_cpu_info_); \
+    for (int i = 0; i < benchmark_iterations_; ++i) { \
+      SRC_FMT_PLANAR##To##FMT_PLANAR( \
+          src_y + OFF, kWidth, src_u + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), \
+          src_v + OFF, SUBSAMPLE(kWidth, SRC_SUBSAMP_X), dst_y_opt, kWidth, \
+          dst_uv_opt, SUBSAMPLE(kWidth * 2, SUBSAMP_X), kWidth, NEG kHeight); \
+    } \
+    int max_diff = 0; \
+    for (int i = 0; i < kHeight; ++i) { \
+      for (int j = 0; j < kWidth; ++j) { \
+        int abs_diff = abs(static_cast<int>(dst_y_c[i * kWidth + j]) - \
+                           static_cast<int>(dst_y_opt[i * kWidth + j])); \
+        if (abs_diff > max_diff) { \
+          max_diff = abs_diff; \
+        } \
+      } \
+    } \
+    EXPECT_LE(max_diff, 1); \
+    for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
+      for (int j = 0; j < SUBSAMPLE(kWidth * 2, SUBSAMP_X); ++j) { \
+        int abs_diff = \
+            abs(static_cast<int>( \
+                    dst_uv_c[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j]) - \
+                static_cast<int>( \
+                    dst_uv_opt[i * SUBSAMPLE(kWidth * 2, SUBSAMP_X) + j])); \
+        if (abs_diff > max_diff) { \
+          max_diff = abs_diff; \
+        } \
+      } \
+    } \
+    EXPECT_LE(max_diff, 1); \
+    free_aligned_buffer_page_end(dst_y_c); \
+    free_aligned_buffer_page_end(dst_uv_c); \
+    free_aligned_buffer_page_end(dst_y_opt); \
+    free_aligned_buffer_page_end(dst_uv_opt); \
+    free_aligned_buffer_page_end(src_y); \
+    free_aligned_buffer_page_end(src_u); \
+    free_aligned_buffer_page_end(src_v); \
   }
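TESTPLANARTOBPI drives the planar-to-biplanar conversions (I420 to NV12/NV21), where the destination chroma is a single interleaved UV plane; that is why the macro sizes it as SUBSAMPLE(kWidth * 2, SUBSAMP_X) bytes per row and compares it with one loop. The layout in brief (a sketch, assuming even dimensions):

#include <stdint.h>

// NV12 layout for an even w x h frame: a full-resolution Y plane followed
// by one half-height plane of interleaved U,V pairs.
typedef struct {
  const uint8_t* y;   // w * h bytes; sample at y[row * w + col]
  const uint8_t* uv;  // w * (h / 2) bytes; U at uv[row * w + 2 * col],
                      // V at uv[row * w + 2 * col + 1]
} NV12View;           // NV21 is identical with U and V swapped in each pair.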
 
 #define TESTPLANARTOBP(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \
@@ -393,24 +379,19 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2)
     const int kWidth = ((W1280) > 0) ? (W1280) : 1; \
     const int kHeight = benchmark_height_; \
     align_buffer_page_end(src_y, kWidth* kHeight + OFF); \
-    align_buffer_page_end(src_uv, \
-                          2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
-                                  SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
-                              OFF); \
+    align_buffer_page_end(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \
+                                          SUBSAMPLE(kHeight, SRC_SUBSAMP_Y) + \
+                                      OFF); \
     align_buffer_page_end(dst_y_c, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_c, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_u_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_c, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                       SUBSAMPLE(kHeight, SUBSAMP_Y)); \
     align_buffer_page_end(dst_y_opt, kWidth* kHeight); \
-    align_buffer_page_end( \
-        dst_u_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
-    align_buffer_page_end( \
-        dst_v_opt, \
-        SUBSAMPLE(kWidth, SUBSAMP_X) * SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_u_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
+    align_buffer_page_end(dst_v_opt, SUBSAMPLE(kWidth, SUBSAMP_X) * \
+                                         SUBSAMPLE(kHeight, SUBSAMP_Y)); \
     for (int i = 0; i < kHeight; ++i) \
       for (int j = 0; j < kWidth; ++j) \
         src_y[i * kWidth + j + OFF] = (fastrand() & 0xff); \
@@ -1388,12 +1369,10 @@ TEST_F(LibYUVConvertTest, MJPGToI420) {
   const int kSize = kImageSize + kOff;
   align_buffer_page_end(orig_pixels, kSize);
   align_buffer_page_end(dst_y_opt, benchmark_width_ * benchmark_height_);
-  align_buffer_page_end(
-      dst_u_opt,
-      SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
-  align_buffer_page_end(
-      dst_v_opt,
-      SUBSAMPLE(benchmark_width_, 2) * SUBSAMPLE(benchmark_height_, 2));
+  align_buffer_page_end(dst_u_opt, SUBSAMPLE(benchmark_width_, 2) *
+                                       SUBSAMPLE(benchmark_height_, 2));
+  align_buffer_page_end(dst_v_opt, SUBSAMPLE(benchmark_width_, 2) *
+                                       SUBSAMPLE(benchmark_height_, 2));
 
   // EOI, SOI to make MJPG appear valid.
   memset(orig_pixels, 0, kSize);
@@ -1465,20 +1444,16 @@ TEST_F(LibYUVConvertTest, NV12Crop) {
   uint8* src_uv = src_y + kWidth * kHeight;
 
   align_buffer_page_end(dst_y, kDestWidth * kDestHeight);
-  align_buffer_page_end(
-      dst_u,
-      SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-  align_buffer_page_end(
-      dst_v,
-      SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_u, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_v, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                                   SUBSAMPLE(kDestHeight, SUBSAMP_Y));
   align_buffer_page_end(dst_y_2, kDestWidth * kDestHeight);
-  align_buffer_page_end(
-      dst_u_2,
-      SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
-  align_buffer_page_end(
-      dst_v_2,
-      SUBSAMPLE(kDestWidth, SUBSAMP_X) * SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_u_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
+  align_buffer_page_end(dst_v_2, SUBSAMPLE(kDestWidth, SUBSAMP_X) *
+                                     SUBSAMPLE(kDestHeight, SUBSAMP_Y));
 
   for (int i = 0; i < kHeight * kWidth; ++i) {
     src_y[i] = (fastrand() & 0xff);
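A closing note on SUBSAMPLE(v, a) = ((v + a - 1) / a), used throughout these tests: it is ceiling division, so chroma planes for odd-sized images are rounded up rather than truncated. For example:

#include <assert.h>

#define SUBSAMPLE(v, a) ((((v) + (a)-1)) / (a))

int main(void) {
  assert(SUBSAMPLE(1279, 2) == 640);  // odd width still gets 640 chroma cols
  assert(SUBSAMPLE(719, 2) == 360);   // odd height rounds up too
  assert(SUBSAMPLE(1280, 2) == 640);  // even sizes are unchanged
  return 0;
}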