diff --git a/source/convert_from_argb.cc b/source/convert_from_argb.cc index 57cda4c85..48a645666 100644 --- a/source/convert_from_argb.cc +++ b/source/convert_from_argb.cc @@ -1061,7 +1061,7 @@ int ARGBToJ400(const uint8* src_argb, int src_stride_argb, // Coalesce contiguous rows. if (src_stride_argb == width * 4 && dst_stride_yj == width) { - return ARGBToI400(src_argb, 0, + return ARGBToJ400(src_argb, 0, dst_yj, 0, width * height, 1); } diff --git a/source/row_common.cc b/source/row_common.cc index f06847000..538b04db9 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -256,8 +256,23 @@ MAKEROWY(RGB24, 2, 1, 0, 3) MAKEROWY(RAW, 0, 1, 2, 3) #undef MAKEROWY +// BT.601 mpeg range +// b 0.1016 * 255 = 25.908 = 25 +// g 0.5078 * 255 = 129.489 = 129 +// r 0.2578 * 255 = 65.739 = 66 +// = 0.8672. 1/.8672 = 1.1531 +// BT.601 full range 8 bit (not used) +// b 0.1016 * 1.1531 = 0.1172 * 255 = 29.886 = 30 +// g 0.5078 * 1.1531 = 0.5855 * 255 = 149.3025 = 149 +// r 0.2578 * 1.1531 = 0.2973 * 255 = 75.8115 = 76 +// 30 + 149 + 76 = 255 +// BT.601 full range 7 bit +// b 0.1172 * 127 = 14.8844 = 15 +// g 0.5855 * 127 = 74.3585 = 74 +// r 0.2973 * 127 = 37.7571 = 38 + static __inline int RGBToYJ(uint8 r, uint8 g, uint8 b) { - return (66 * r + 129 * g + 25 * b + 0x0080) >> 8; + return (38 * r + 74 * g + 15 * b + 64) >> 7; } #define MAKEROWYJ(NAME, R, G, B, BPP) \ diff --git a/source/row_neon.cc b/source/row_neon.cc index e8f14ec1b..a22908b26 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -1338,9 +1338,9 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( - "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient - "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient - "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient + "vmov.u8 d24, #15 \n" // B * 0.1172 coefficient + "vmov.u8 d25, #74 \n" // G * 0.5855 coefficient + "vmov.u8 d26, #38 \n" // R * 0.2973 coefficient 
".p2align 2 \n" "1: \n" "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels. diff --git a/source/row_posix.cc b/source/row_posix.cc index db2e5f5cc..4f722c726 100644 --- a/source/row_posix.cc +++ b/source/row_posix.cc @@ -35,6 +35,11 @@ CONST vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; +// JPeg full range. +CONST vec8 kARGBToYJ = { + 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0 +}; + CONST vec8 kARGBToU = { 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 }; @@ -86,6 +91,10 @@ CONST uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +CONST vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + CONST uvec8 kAddUV128 = { 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u @@ -645,6 +654,7 @@ void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 \n" + "movdqa %4,%%xmm5 \n" ".p2align 4 \n" "1: \n" "movdqa (%0),%%xmm0 \n" @@ -658,6 +668,8 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "lea 0x40(%0),%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" @@ -668,10 +680,11 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToY) // %3 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } @@ -716,6 +729,7 @@ void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movdqa %3,%%xmm4 
\n" + "movdqa %4,%%xmm5 \n" ".p2align 4 \n" "1: \n" "movdqu (%0),%%xmm0 \n" @@ -729,6 +743,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { "lea 0x40(%0),%0 \n" "phaddw %%xmm1,%%xmm0 \n" "phaddw %%xmm3,%%xmm2 \n" + "paddw %%xmm5,%%xmm0 \n" + "paddw %%xmm5,%%xmm2 \n" "psrlw $0x7,%%xmm0 \n" "psrlw $0x7,%%xmm2 \n" "packuswb %%xmm2,%%xmm0 \n" @@ -739,13 +755,15 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { : "+r"(src_argb), // %0 "+r"(dst_y), // %1 "+r"(pix) // %2 - : "m"(kARGBToY) // %3 + : "m"(kARGBToYJ), // %3 + "m"(kAddYJ64) // %4 : "memory", "cc" #if defined(__SSE2__) - , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4" + , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5" #endif ); } + // TODO(fbarchard): pass xmm constants to single block of assembly. // fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes // 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers, diff --git a/source/row_win.cc b/source/row_win.cc index 78ee99860..d56ffd717 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -25,6 +25,11 @@ static const vec8 kARGBToY = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 }; +// JPeg full range. 
+static const vec8 kARGBToYJ = { + 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0, 15, 74, 38, 0 +}; + static const lvec8 kARGBToY_AVX = { 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 @@ -103,6 +108,10 @@ static const uvec8 kAddY16 = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u }; +static const vec16 kAddYJ64 = { + 64, 64, 64, 64, 64, 64, 64, 64 +}; + static const ulvec8 kAddY16_AVX = { 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, @@ -671,7 +680,8 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToY + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 align 16 convertloop: @@ -686,6 +696,8 @@ void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 + paddw xmm0, xmm5 + paddw xmm2, xmm5 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 @@ -776,7 +788,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { mov eax, [esp + 4] /* src_argb */ mov edx, [esp + 8] /* dst_y */ mov ecx, [esp + 12] /* pix */ - movdqa xmm4, kARGBToY + movdqa xmm4, kARGBToYJ + movdqa xmm5, kAddYJ64 align 16 convertloop: @@ -791,6 +804,8 @@ void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { lea eax, [eax + 64] phaddw xmm0, xmm1 phaddw xmm2, xmm3 + paddw xmm0, xmm5 + paddw xmm2, xmm5 psrlw xmm0, 7 psrlw xmm2, 7 packuswb xmm0, xmm2 diff --git a/unit_test/compare_test.cc b/unit_test/compare_test.cc index d35ecb730..225e75736 100644 --- a/unit_test/compare_test.cc +++ b/unit_test/compare_test.cc @@ -203,7 +203,9 @@ TEST_F(libyuvTest, Psnr) { kSrcWidth, kSrcHeight); EXPECT_GT(err, 4.0); - EXPECT_LT(err, 5.0); + if (kSrcWidth * kSrcHeight >= 256) { + EXPECT_LT(err, 5.0); + } srandom(time(NULL)); diff 
--git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 5edcb78e0..2cfd20001 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -35,7 +35,7 @@ namespace libyuv { #define TESTPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ align_buffer_64(src_y, kWidth * kHeight + OFF); \ align_buffer_64(src_u, \ @@ -170,7 +170,7 @@ TESTPLANARTOP(I444, 1, 1, I444, 1, 1) #define TESTPLANARTOBPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ align_buffer_64(src_y, kWidth * kHeight + OFF); \ align_buffer_64(src_u, \ @@ -273,7 +273,7 @@ TESTPLANARTOBP(I420, 2, 2, NV21, 2, 2) #define TESTBIPLANARTOPI(SRC_FMT_PLANAR, SRC_SUBSAMP_X, SRC_SUBSAMP_Y, \ FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, W1280, N, NEG, OFF) \ TEST_F(libyuvTest, SRC_FMT_PLANAR##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ align_buffer_64(src_y, kWidth * kHeight + OFF); \ align_buffer_64(src_uv, 2 * SUBSAMPLE(kWidth, SRC_SUBSAMP_X) * \ @@ -389,7 +389,7 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2) #define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \ W1280, DIFF, N, NEG, OFF, FMT_C, BPP_C) \ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? 
W1280 : 1; \ const int kHeight = benchmark_height_; \ const int kStrideB = ((kWidth * 8 * BPP_B + 7) / 8 + ALIGN - 1) / \ ALIGN * ALIGN; \ @@ -503,7 +503,7 @@ TESTPLANARTOB(I420, 2, 2, I400, 1, 1, 0, ARGB, 4) #define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ W1280, DIFF, N, NEG, OFF) \ TEST_F(libyuvTest, FMT_PLANAR##To##FMT_B##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ const int kStrideB = kWidth * BPP_B; \ align_buffer_64(src_y, kWidth * kHeight + OFF); \ @@ -582,7 +582,7 @@ TESTBIPLANARTOB(NV21, 2, 2, RGB565, 2, 9) #define TESTATOPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, DIFF, N, NEG, OFF) \ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ align_buffer_64(src_argb, kStride * kHeight + OFF); \ @@ -712,7 +712,7 @@ TESTATOPLANAR(BayerGRBG, 1, I420, 2, 2, 4) #define TESTATOBIPLANARI(FMT_A, BPP_A, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ W1280, N, NEG, OFF) \ TEST_F(libyuvTest, FMT_A##To##FMT_PLANAR##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? W1280 : 1; \ const int kHeight = benchmark_height_; \ const int kStride = (kWidth * 8 * BPP_A + 7) / 8; \ align_buffer_64(src_argb, kStride * kHeight + OFF); \ @@ -789,7 +789,7 @@ TESTATOBIPLANAR(ARGB, 4, NV21, 2, 2) FMT_B, BPP_B, STRIDE_B, \ W1280, DIFF, N, NEG, OFF) \ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ - const int kWidth = W1280; \ + const int kWidth = W1280 > 1 ? 
W1280 : 1; \ const int kHeight = benchmark_height_; \ const int kStrideA = (kWidth * BPP_A + STRIDE_A - 1) / STRIDE_A * STRIDE_A; \ const int kStrideB = (kWidth * BPP_B + STRIDE_B - 1) / STRIDE_B * STRIDE_B; \ @@ -814,6 +814,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##N) { \ } \ int max_diff = 0; \ for (int i = 0; i < kStrideB * kHeight; ++i) { \ + EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \ int abs_diff = \ abs(static_cast<int>(dst_argb_c[i]) - \ static_cast<int>(dst_argb_opt[i])); \ @@ -859,6 +860,7 @@ TEST_F(libyuvTest, FMT_A##To##FMT_B##_Random) { \ int abs_diff = \ abs(static_cast<int>(dst_argb_c[i]) - \ static_cast<int>(dst_argb_opt[i])); \ + EXPECT_NEAR(dst_argb_c[i], dst_argb_opt[i], DIFF); \ if (abs_diff > max_diff) { \ max_diff = abs_diff; \ } \ @@ -903,7 +905,7 @@ TESTATOB(ARGB, 4, 4, 1, BayerGRBG, 1, 2, 2, 0) TESTATOB(ARGB, 4, 4, 1, YUY2, 2, 4, 1, 4) TESTATOB(ARGB, 4, 4, 1, UYVY, 2, 4, 1, 4) TESTATOB(ARGB, 4, 4, 1, I400, 1, 1, 1, 2) -TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 2) +TESTATOB(ARGB, 4, 4, 1, J400, 1, 1, 1, 0) TESTATOB(BGRA, 4, 4, 1, ARGB, 4, 4, 1, 0) TESTATOB(ABGR, 4, 4, 1, ARGB, 4, 4, 1, 0) TESTATOB(RGBA, 4, 4, 1, ARGB, 4, 4, 1, 0) diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 94284b66b..506d11a8c 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -107,6 +107,9 @@ TEST_F(libyuvTest, TestAttenuate) { static int TestAttenuateI(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb, kStride * height + off); @@ -170,6 +173,9 @@ TEST_F(libyuvTest, ARGBAttenuate_Opt) { static int TestUnattenuateI(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb, kStride * height + off); @@ -787,6 +793,9 @@ TESTINTERPOLATE(85) static int 
TestBlend(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = width * kBpp; align_buffer_64(src_argb_a, kStride * height + off); @@ -1101,6 +1110,9 @@ TEST_F(libyuvTest, TestCopyPlane) { static int TestMultiply(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb_a, kStride * height + off); @@ -1169,6 +1181,9 @@ TEST_F(libyuvTest, ARGBMultiply_Opt) { static int TestAdd(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb_a, kStride * height + off); @@ -1237,6 +1252,9 @@ TEST_F(libyuvTest, ARGBAdd_Opt) { static int TestSubtract(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb_a, kStride * height + off); @@ -1305,6 +1323,9 @@ TEST_F(libyuvTest, ARGBSubtract_Opt) { static int TestSobel(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb_a, kStride * height + off); @@ -1368,6 +1389,9 @@ TEST_F(libyuvTest, ARGBSobel_Opt) { static int TestSobelXY(int width, int height, int benchmark_iterations, int invert, int off) { + if (width < 1) { + width = 1; + } const int kBpp = 4; const int kStride = (width * kBpp + 15) & ~15; align_buffer_64(src_argb_a, kStride * height + off);