From 0b8bb60f2e489b239ae75840554f592433900edb Mon Sep 17 00:00:00 2001 From: Frank Barchard Date: Tue, 5 May 2020 11:16:59 -0700 Subject: [PATCH] ARGBToI420 C version match SIMD Bug: libyuv:447 Change-Id: Iafb28cf635b355837caf41c26baee665642f4f95 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2181779 Reviewed-by: richard winterton Commit-Queue: Frank Barchard --- README.chromium | 2 +- include/libyuv/version.h | 2 +- source/planar_functions.cc | 9 ++++ source/row_common.cc | 92 ++++++++++++++++++++++++++++++++------ unit_test/convert_test.cc | 43 +++++++++--------- unit_test/planar_test.cc | 15 +++---- 6 files changed, 115 insertions(+), 48 deletions(-) diff --git a/README.chromium b/README.chromium index 51381f24e..e78d24b20 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 1751 +Version: 1752 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index b9ab82963..26a9520da 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 1751 +#define LIBYUV_VERSION 1752 #endif // INCLUDE_LIBYUV_VERSION_H_ diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 7980dcfa1..3489c9711 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -579,6 +579,15 @@ int NV21ToNV12(const uint8_t* src_y, if (dst_y) { CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height); } + + // Negative height means invert the image. + if (height < 0) { + height = -height; + halfheight = (height + 1) >> 1; + src_vu = src_vu + (halfheight - 1) * src_stride_vu; + src_stride_vu = -src_stride_vu; + } + SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth, halfheight); return 0; diff --git a/source/row_common.cc b/source/row_common.cc index 5e801daf6..c2f418925 100644 --- a/source/row_common.cc +++ b/source/row_common.cc @@ -27,6 +27,12 @@ extern "C" { #define LIBYUV_RGB7 1 #endif +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \ + defined(_M_IX86) +#define LIBYUV_ARGBTOUV_PAVGB 1 +#define LIBYUV_RGBTOU_TRUNCATE 1 +#endif + // llvm x86 is poor at ternary operator, so use branchless min/max. #define USE_BRANCHLESS 1 @@ -420,14 +426,36 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) { } #endif +#ifdef LIBYUV_RGBTOU_TRUNCATE +static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { + return (112 * b - 74 * g - 38 * r + 0x8000) >> 8; +} +static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { + return (112 * r - 94 * g - 18 * b + 0x8000) >> 8; +} +#else static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) { return (112 * b - 74 * g - 38 * r + 0x8080) >> 8; } static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { return (112 * r - 94 * g - 18 * b + 0x8080) >> 8; } +#endif + +#if !defined(LIBYUV_ARGBTOUV_PAVGB) +static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * b - (74 / 2) * g - (38 / 2) * r + 0x8080) >> 8; +} +static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) { + return ((112 / 2) * r - (94 / 2) * g - (18 / 2) * b + 0x8080) >> 8; +} +#endif + +#define AVGB(a, b) (((a) + (b) + 1) >> 1) // ARGBToY_C and ARGBToUV_C +// Intel version mimic SSE/AVX which does 2 pavgb +#if LIBYUV_ARGBTOUV_PAVGB #define MAKEROWY(NAME, R, G, B, BPP) \ void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ int x; \ @@ -442,15 +470,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ int x; \ for (x = 0; x < width - 1; x += 2) { \ - uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ - src_rgb1[B + BPP]) >> \ - 2; \ - uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ - src_rgb1[G + BPP]) >> \ - 2; \ - uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ - src_rgb1[R + BPP]) >> \ - 2; \ + uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \ + AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \ + uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \ + AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \ + uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \ + AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ src_rgb0 += BPP * 2; \ @@ -459,13 +484,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) { dst_v += 1; \ } \ if (width & 1) { \ - uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \ - uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \ - uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \ + uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \ + uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \ + uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \ dst_u[0] = RGBToU(ar, ag, ab); \ dst_v[0] = RGBToV(ar, ag, ab); \ } \ } +#else +// ARM version does sum / 2 then multiply by 2x smaller coefficients +#define MAKEROWY(NAME, R, G, B, BPP) \ + void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ + int x; \ + for (x = 0; x < width; ++x) { \ + dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \ + src_argb0 += BPP; \ + dst_y += 1; \ + } \ + } \ + void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \ + uint8_t* dst_u, uint8_t* dst_v, int width) { \ + const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \ + int x; \ + for (x = 0; x < width - 1; x += 2) { \ + uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \ + src_rgb1[B + BPP] + 1) >> \ + 1; \ + uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \ + src_rgb1[G + BPP] + 1) >> \ + 1; \ + uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \ + src_rgb1[R + BPP] + 1) >> \ + 1; \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + src_rgb0 += BPP * 2; \ + src_rgb1 += BPP * 2; \ + dst_u += 1; \ + dst_v += 1; \ + } \ + if (width & 1) { \ + uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \ + uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \ + uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \ + dst_u[0] = RGB2xToU(ar, ag, ab); \ + dst_v[0] = RGB2xToV(ar, ag, ab); \ + } \ + } +#endif MAKEROWY(ARGB, 2, 1, 0, 4) MAKEROWY(BGRA, 1, 2, 3, 4) @@ -519,8 +585,6 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) { return (127 * r - 107 * g - 20 * b + 0x8080) >> 8; } -#define AVGB(a, b) (((a) + (b) + 1) >> 1) - // ARGBToYJ_C and ARGBToUVJ_C #define MAKEROWYJ(NAME, R, G, B, BPP) \ void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \ diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index 323f8d22a..5c95d9312 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -32,8 +32,9 @@ #endif #if defined(__arm__) || defined(__aarch64__) -// arm version subsamples by summing 4 pixels then multiplying by matrix with -// 4x smaller coefficients which are rounded to nearest integer. +// arm version subsamples by summing 4 pixels, rounding divide by 2, then +// multiplying by matrix with 2x smaller coefficients which are rounded +// to nearest integer. #define ARM_YUV_ERROR 4 #else #define ARM_YUV_ERROR 0 @@ -246,7 +247,7 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2) } \ } \ } \ - EXPECT_LE(max_diff, 3); \ + EXPECT_LE(max_diff, 0); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \ int abs_diff = abs( \ @@ -1008,30 +1009,28 @@ TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2) TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \ benchmark_width_, DIFF, _Opt, +, 0) -TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2) -TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2) +TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 0) +TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 0) +TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 0) +TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 0) TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR) TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR) #ifdef INTEL_TEST TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15) TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17) -#endif -TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2) -TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2) -TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4) -TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4) -TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR) -#ifdef INTEL_TEST TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5) #endif -TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4) -TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2) -TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2) -TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2) -TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) +TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 0) +TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 0) +TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 0) +TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 0) +TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 0) +TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR) +TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 0) +TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 0) +TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 0) +TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 0) +TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 0) #define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \ SUBSAMP_Y, W1280, N, NEG, OFF) \ @@ -1072,7 +1071,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) } \ } \ } \ - EXPECT_LE(max_diff, 4); \ + EXPECT_LE(max_diff, 0); \ for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \ for (int j = 0; j < kStrideUV * 2; ++j) { \ int abs_diff = \ @@ -1083,7 +1082,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2) } \ } \ } \ - EXPECT_LE(max_diff, 4); \ + EXPECT_LE(max_diff, 0); \ free_aligned_buffer_page_end(dst_y_c); \ free_aligned_buffer_page_end(dst_uv_c); \ free_aligned_buffer_page_end(dst_y_opt); \ diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc index 50bca4e4a..736e478a7 100644 --- a/unit_test/planar_test.cc +++ b/unit_test/planar_test.cc @@ -3512,7 +3512,6 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) { } TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) { - // Round count up to multiple of 16 int dst_width = (benchmark_width_ + 1) / 2; int dst_height = (benchmark_height_ + 1) / 2; align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_); @@ -3529,15 +3528,11 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) { MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height); MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height); - ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_, - benchmark_height_, - - tmp_pixels_u, dst_width, dst_width, dst_height, kFilterBilinear); - ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_, - benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height, - kFilterBilinear); - MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width, - dst_pixels_uv_c, dst_width * 2, dst_width, dst_height); + MaskCpuFlags(disable_cpu_flags_); + HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v, + benchmark_width_, dst_pixels_uv_c, dst_width * 2, + benchmark_width_, benchmark_height_); + MaskCpuFlags(benchmark_cpu_info_); for (int i = 0; i < benchmark_iterations_; ++i) { HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,