ARGBToI420: make the C version match SIMD

Bug: libyuv:447
Change-Id: Iafb28cf635b355837caf41c26baee665642f4f95
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2181779
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2020-05-05 11:16:59 -07:00 committed by Commit Bot
parent 7a61759f78
commit 0b8bb60f2e
6 changed files with 115 additions and 48 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1751
Version: 1752
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1751
#define LIBYUV_VERSION 1752
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -579,6 +579,15 @@ int NV21ToNV12(const uint8_t* src_y,
if (dst_y) {
CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
}
// Negative height means invert the image.
if (height < 0) {
height = -height;
halfheight = (height + 1) >> 1;
src_vu = src_vu + (halfheight - 1) * src_stride_vu;
src_stride_vu = -src_stride_vu;
}
SwapUVPlane(src_vu, src_stride_vu, dst_uv, dst_stride_uv, halfwidth,
halfheight);
return 0;

View File

@ -27,6 +27,12 @@ extern "C" {
#define LIBYUV_RGB7 1
#endif
#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || \
defined(_M_IX86)
#define LIBYUV_ARGBTOUV_PAVGB 1
#define LIBYUV_RGBTOU_TRUNCATE 1
#endif
// llvm x86 is poor at ternary operator, so use branchless min/max.
#define USE_BRANCHLESS 1
@ -420,14 +426,36 @@ static __inline int RGBToY(uint8_t r, uint8_t g, uint8_t b) {
}
#endif
#ifdef LIBYUV_RGBTOU_TRUNCATE
// x86 path: bias of 0x8000 means the >> 8 truncates the fraction,
// presumably to match the SIMD (SSE/AVX) result — see LIBYUV_RGBTOU_TRUNCATE.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int u = 112 * b - 74 * g - 38 * r;
  return (u + 0x8000) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int v = 112 * r - 94 * g - 18 * b;
  return (v + 0x8000) >> 8;
}
#else
// Default path: bias of 0x8080 = 0x8000 (chroma offset of 128 after >> 8)
// plus 0x80 so the shift rounds to nearest.
static __inline int RGBToU(uint8_t r, uint8_t g, uint8_t b) {
  const int u = 112 * b - 74 * g - 38 * r;
  return (u + 0x8080) >> 8;
}
static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
  const int v = 112 * r - 94 * g - 18 * b;
  return (v + 0x8080) >> 8;
}
#endif
#if !defined(LIBYUV_ARGBTOUV_PAVGB)
// Chroma helpers for the non-PAVGB path.  Inputs are 2x pixel values (a sum
// of two samples), so the coefficients are half those of RGBToU/RGBToV:
// 112/2, 74/2, 38/2 for U and 112/2, 94/2, 18/2 for V.
static __inline int RGB2xToU(uint16_t r, uint16_t g, uint16_t b) {
  const int u2x = 56 * b - 37 * g - 19 * r;
  return (u2x + 0x8080) >> 8;
}
static __inline int RGB2xToV(uint16_t r, uint16_t g, uint16_t b) {
  const int v2x = 56 * r - 47 * g - 9 * b;
  return (v2x + 0x8080) >> 8;
}
#endif
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToY_C and ARGBToUV_C
// Intel version mimics SSE/AVX, which does 2 pavgb operations
#if LIBYUV_ARGBTOUV_PAVGB
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
@ -442,15 +470,12 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint8_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP]) >> \
2; \
uint8_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP]) >> \
2; \
uint8_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP]) >> \
2; \
uint8_t ab = AVGB(AVGB(src_rgb0[B], src_rgb1[B]), \
AVGB(src_rgb0[B + BPP], src_rgb1[B + BPP])); \
uint8_t ag = AVGB(AVGB(src_rgb0[G], src_rgb1[G]), \
AVGB(src_rgb0[G + BPP], src_rgb1[G + BPP])); \
uint8_t ar = AVGB(AVGB(src_rgb0[R], src_rgb1[R]), \
AVGB(src_rgb0[R + BPP], src_rgb1[R + BPP])); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
@ -459,13 +484,54 @@ static __inline int RGBToV(uint8_t r, uint8_t g, uint8_t b) {
dst_v += 1; \
} \
if (width & 1) { \
uint8_t ab = (src_rgb0[B] + src_rgb1[B]) >> 1; \
uint8_t ag = (src_rgb0[G] + src_rgb1[G]) >> 1; \
uint8_t ar = (src_rgb0[R] + src_rgb1[R]) >> 1; \
uint8_t ab = AVGB(src_rgb0[B], src_rgb1[B]); \
uint8_t ag = AVGB(src_rgb0[G], src_rgb1[G]); \
uint8_t ar = AVGB(src_rgb0[R], src_rgb1[R]); \
dst_u[0] = RGBToU(ar, ag, ab); \
dst_v[0] = RGBToV(ar, ag, ab); \
} \
}
#else
// ARM version does sum / 2 then multiply by 2x smaller coefficients
#define MAKEROWY(NAME, R, G, B, BPP) \
void NAME##ToYRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \
int x; \
for (x = 0; x < width; ++x) { \
dst_y[0] = RGBToY(src_argb0[R], src_argb0[G], src_argb0[B]); \
src_argb0 += BPP; \
dst_y += 1; \
} \
} \
void NAME##ToUVRow_C(const uint8_t* src_rgb0, int src_stride_rgb, \
uint8_t* dst_u, uint8_t* dst_v, int width) { \
const uint8_t* src_rgb1 = src_rgb0 + src_stride_rgb; \
int x; \
for (x = 0; x < width - 1; x += 2) { \
uint16_t ab = (src_rgb0[B] + src_rgb0[B + BPP] + src_rgb1[B] + \
src_rgb1[B + BPP] + 1) >> \
1; \
uint16_t ag = (src_rgb0[G] + src_rgb0[G + BPP] + src_rgb1[G] + \
src_rgb1[G + BPP] + 1) >> \
1; \
uint16_t ar = (src_rgb0[R] + src_rgb0[R + BPP] + src_rgb1[R] + \
src_rgb1[R + BPP] + 1) >> \
1; \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
src_rgb0 += BPP * 2; \
src_rgb1 += BPP * 2; \
dst_u += 1; \
dst_v += 1; \
} \
if (width & 1) { \
uint16_t ab = (src_rgb0[B] + src_rgb1[B]); \
uint16_t ag = (src_rgb0[G] + src_rgb1[G]); \
uint16_t ar = (src_rgb0[R] + src_rgb1[R]); \
dst_u[0] = RGB2xToU(ar, ag, ab); \
dst_v[0] = RGB2xToV(ar, ag, ab); \
} \
}
#endif
MAKEROWY(ARGB, 2, 1, 0, 4)
MAKEROWY(BGRA, 1, 2, 3, 4)
@ -519,8 +585,6 @@ static __inline int RGBToVJ(uint8_t r, uint8_t g, uint8_t b) {
return (127 * r - 107 * g - 20 * b + 0x8080) >> 8;
}
#define AVGB(a, b) (((a) + (b) + 1) >> 1)
// ARGBToYJ_C and ARGBToUVJ_C
#define MAKEROWYJ(NAME, R, G, B, BPP) \
void NAME##ToYJRow_C(const uint8_t* src_argb0, uint8_t* dst_y, int width) { \

View File

@ -32,8 +32,9 @@
#endif
#if defined(__arm__) || defined(__aarch64__)
// arm version subsamples by summing 4 pixels then multiplying by matrix with
// 4x smaller coefficients which are rounded to nearest integer.
// arm version subsamples by summing 4 pixels, rounding divide by 2, then
// multiplying by matrix with 2x smaller coefficients which are rounded
// to nearest integer.
#define ARM_YUV_ERROR 4
#else
#define ARM_YUV_ERROR 0
@ -246,7 +247,7 @@ TESTPLANARTOP(H420, uint8_t, 1, 2, 2, H010, uint16_t, 2, 2, 2)
} \
} \
} \
EXPECT_LE(max_diff, 3); \
EXPECT_LE(max_diff, 0); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < SUBSAMPLE(kWidth, SUBSAMP_X); ++j) { \
int abs_diff = abs( \
@ -1008,30 +1009,28 @@ TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
TESTATOPLANARI(FMT_A, BPP_A, YALIGN, FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, \
benchmark_width_, DIFF, _Opt, +, 0)
TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 2)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 2)
TESTATOPLANAR(ABGR, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(ARGB, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(ARGB, 4, 1, I422, 2, 1, 0)
TESTATOPLANAR(ARGB, 4, 1, I444, 1, 1, 0)
TESTATOPLANAR(ARGB, 4, 1, J420, 2, 2, ARM_YUV_ERROR)
TESTATOPLANAR(ARGB, 4, 1, J422, 2, 1, ARM_YUV_ERROR)
#ifdef INTEL_TEST
TESTATOPLANAR(ARGB1555, 2, 1, I420, 2, 2, 15)
TESTATOPLANAR(ARGB4444, 2, 1, I420, 2, 2, 17)
#endif
TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR)
#ifdef INTEL_TEST
TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
#endif
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 2)
TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 2)
TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
TESTATOPLANAR(BGRA, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 0)
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 0)
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 0)
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 0)
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, ARM_YUV_ERROR)
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 0)
TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 0)
TESTATOPLANAR(UYVY, 2, 1, I422, 2, 1, 0)
TESTATOPLANAR(YUY2, 2, 1, I420, 2, 2, 0)
TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 0)
#define TESTATOBIPLANARI(FMT_A, SUB_A, BPP_A, FMT_PLANAR, SUBSAMP_X, \
SUBSAMP_Y, W1280, N, NEG, OFF) \
@ -1072,7 +1071,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
} \
} \
} \
EXPECT_LE(max_diff, 4); \
EXPECT_LE(max_diff, 0); \
for (int i = 0; i < SUBSAMPLE(kHeight, SUBSAMP_Y); ++i) { \
for (int j = 0; j < kStrideUV * 2; ++j) { \
int abs_diff = \
@ -1083,7 +1082,7 @@ TESTATOPLANAR(YUY2, 2, 1, I422, 2, 1, 2)
} \
} \
} \
EXPECT_LE(max_diff, 4); \
EXPECT_LE(max_diff, 0); \
free_aligned_buffer_page_end(dst_y_c); \
free_aligned_buffer_page_end(dst_uv_c); \
free_aligned_buffer_page_end(dst_y_opt); \

View File

@ -3512,7 +3512,6 @@ TEST_F(LibYUVPlanarTest, TestGaussPlane_F32) {
}
TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
// Round count up to multiple of 16
int dst_width = (benchmark_width_ + 1) / 2;
int dst_height = (benchmark_height_ + 1) / 2;
align_buffer_page_end(src_pixels_u, benchmark_width_ * benchmark_height_);
@ -3529,15 +3528,11 @@ TEST_F(LibYUVPlanarTest, HalfMergeUVPlane_Opt) {
MemRandomize(dst_pixels_uv_opt, dst_width * 2 * dst_height);
MemRandomize(dst_pixels_uv_c, dst_width * 2 * dst_height);
ScalePlane(src_pixels_u, benchmark_width_, benchmark_width_,
benchmark_height_,
tmp_pixels_u, dst_width, dst_width, dst_height, kFilterBilinear);
ScalePlane(src_pixels_v, benchmark_width_, benchmark_width_,
benchmark_height_, tmp_pixels_v, dst_width, dst_width, dst_height,
kFilterBilinear);
MergeUVPlane(tmp_pixels_u, dst_width, tmp_pixels_v, dst_width,
dst_pixels_uv_c, dst_width * 2, dst_width, dst_height);
MaskCpuFlags(disable_cpu_flags_);
HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,
benchmark_width_, dst_pixels_uv_c, dst_width * 2,
benchmark_width_, benchmark_height_);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
HalfMergeUVPlane(src_pixels_u, benchmark_width_, src_pixels_v,