AR30ToARGB using shifts and masking to vectorize

AR30ToARGB will vectorize if the output channels are combined into a
single 32-bit int store instead of 4 separate byte stores.
Performance is about 2x faster:
Was AR30ToARGB_Opt (1585 ms)
Now AR30ToARGB_Opt (746 ms)

Bug: libyuv:777
Test:LibYUVConvertTest.AR30ToARGB_Opt
Change-Id: Idd47ae599d5d125207bb53e618d6d7e784d4a37c
Reviewed-on: https://chromium-review.googlesource.com/923169
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2018-02-16 09:30:12 -08:00 committed by Commit Bot
parent 9c9215b218
commit 3d6b5658d7
3 changed files with 26 additions and 19 deletions

View File

@ -55,6 +55,10 @@ int ARGBToRGBA(const uint8_t* src_argb,
int width,
int height);
// Aliases
#define ARGBToAB30 ABGRToAR30
#define ABGRToAB30 ARGBToAR30
// Convert ABGR To AR30.
LIBYUV_API
int ABGRToAR30(const uint8_t* src_abgr,

View File

@ -182,14 +182,11 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
dst_argb[0] = b >> 2;
dst_argb[1] = g >> 2;
dst_argb[2] = r >> 2;
dst_argb[3] = a * 0x55;
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
*(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
dst_argb += 4;
src_ar30 += 4;
}
@ -199,14 +196,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
int x;
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
dst_abgr[0] = r >> 2;
dst_abgr[1] = g >> 2;
dst_abgr[2] = b >> 2;
dst_abgr[3] = a * 0x55;
uint32_t b = (ar30 >> 2) & 0xff;
uint32_t g = (ar30 >> 12) & 0xff;
uint32_t r = (ar30 >> 22) & 0xff;
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
*(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
dst_abgr += 4;
src_ar30 += 4;
}
@ -217,10 +211,9 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
for (x = 0; x < width; ++x) {
uint32_t ar30 = *(uint32_t*)src_ar30;
uint32_t b = ar30 & 0x3ff;
uint32_t g = (ar30 >> 10) & 0x3ff;
uint32_t ga = ar30 & 0xc00ffc00;
uint32_t r = (ar30 >> 20) & 0x3ff;
uint32_t a = (ar30 >> 30) & 0x3;
*(uint32_t*)(dst_ab30) = r | (g << 10) | (b << 20) | (a << 30);
*(uint32_t*)(dst_ab30) = r | ga | (b << 20);
dst_ab30 += 4;
src_ar30 += 4;
}

View File

@ -513,15 +513,21 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
MaskCpuFlags(disable_cpu_flags_); \
double time0 = get_time(); \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
kWidth, NEG kHeight); \
double time1 = get_time(); \
MaskCpuFlags(benchmark_cpu_info_); \
for (int i = 0; i < benchmark_iterations_; ++i) { \
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
kStrideB, kWidth, NEG kHeight); \
} \
double time2 = get_time(); \
printf(" %8d us C - %8d us OPT\n", \
static_cast<int>((time1 - time0) * 1e6), \
static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
int max_diff = 0; \
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
@ -1952,6 +1958,10 @@ TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
// 2x2 frames