mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-08 01:36:47 +08:00
AR30ToARGB using shifts and masking to vectorize
AR30ToARGB will vectorize if the output channels are masked together and stored as a single int instead of as 4 separate byte stores.

Performance is 2x faster:
Was: AR30ToARGB_Opt (1585 ms)
Now: AR30ToARGB_Opt (746 ms)

Bug: libyuv:777
Test: LibYUVConvertTest.AR30ToARGB_Opt
Change-Id: Idd47ae599d5d125207bb53e618d6d7e784d4a37c
Reviewed-on: https://chromium-review.googlesource.com/923169
Reviewed-by: Miguel Casas <mcasas@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
9c9215b218
commit
3d6b5658d7
@ -55,6 +55,10 @@ int ARGBToRGBA(const uint8_t* src_argb,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Aliases
|
||||
#define ARGBToAB30 ABGRToAR30
|
||||
#define ABGRToAB30 ARGBToAR30
|
||||
|
||||
// Convert ABGR To AR30.
|
||||
LIBYUV_API
|
||||
int ABGRToAR30(const uint8_t* src_abgr,
|
||||
|
||||
@ -182,14 +182,11 @@ void AR30ToARGBRow_C(const uint8_t* src_ar30, uint8_t* dst_argb, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
uint32_t ar30 = *(uint32_t*)src_ar30;
|
||||
uint32_t b = ar30 & 0x3ff;
|
||||
uint32_t g = (ar30 >> 10) & 0x3ff;
|
||||
uint32_t r = (ar30 >> 20) & 0x3ff;
|
||||
uint32_t a = (ar30 >> 30) & 0x3;
|
||||
dst_argb[0] = b >> 2;
|
||||
dst_argb[1] = g >> 2;
|
||||
dst_argb[2] = r >> 2;
|
||||
dst_argb[3] = a * 0x55;
|
||||
uint32_t b = (ar30 >> 2) & 0xff;
|
||||
uint32_t g = (ar30 >> 12) & 0xff;
|
||||
uint32_t r = (ar30 >> 22) & 0xff;
|
||||
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
|
||||
*(uint32_t*)(dst_argb) = b | (g << 8) | (r << 16) | (a << 24);
|
||||
dst_argb += 4;
|
||||
src_ar30 += 4;
|
||||
}
|
||||
@ -199,14 +196,11 @@ void AR30ToABGRRow_C(const uint8_t* src_ar30, uint8_t* dst_abgr, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
uint32_t ar30 = *(uint32_t*)src_ar30;
|
||||
uint32_t b = ar30 & 0x3ff;
|
||||
uint32_t g = (ar30 >> 10) & 0x3ff;
|
||||
uint32_t r = (ar30 >> 20) & 0x3ff;
|
||||
uint32_t a = (ar30 >> 30) & 0x3;
|
||||
dst_abgr[0] = r >> 2;
|
||||
dst_abgr[1] = g >> 2;
|
||||
dst_abgr[2] = b >> 2;
|
||||
dst_abgr[3] = a * 0x55;
|
||||
uint32_t b = (ar30 >> 2) & 0xff;
|
||||
uint32_t g = (ar30 >> 12) & 0xff;
|
||||
uint32_t r = (ar30 >> 22) & 0xff;
|
||||
uint32_t a = (ar30 >> 30) * 0x55; // Replicate 2 bits to 8 bits.
|
||||
*(uint32_t*)(dst_abgr) = r | (g << 8) | (b << 16) | (a << 24);
|
||||
dst_abgr += 4;
|
||||
src_ar30 += 4;
|
||||
}
|
||||
@ -217,10 +211,9 @@ void AR30ToAB30Row_C(const uint8_t* src_ar30, uint8_t* dst_ab30, int width) {
|
||||
for (x = 0; x < width; ++x) {
|
||||
uint32_t ar30 = *(uint32_t*)src_ar30;
|
||||
uint32_t b = ar30 & 0x3ff;
|
||||
uint32_t g = (ar30 >> 10) & 0x3ff;
|
||||
uint32_t ga = ar30 & 0xc00ffc00;
|
||||
uint32_t r = (ar30 >> 20) & 0x3ff;
|
||||
uint32_t a = (ar30 >> 30) & 0x3;
|
||||
*(uint32_t*)(dst_ab30) = r | (g << 10) | (b << 20) | (a << 30);
|
||||
*(uint32_t*)(dst_ab30) = r | ga | (b << 20);
|
||||
dst_ab30 += 4;
|
||||
src_ar30 += 4;
|
||||
}
|
||||
|
||||
@ -513,15 +513,21 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
|
||||
memset(dst_argb_c + OFF, 1, kStrideB * kHeight); \
|
||||
memset(dst_argb_opt + OFF, 101, kStrideB * kHeight); \
|
||||
MaskCpuFlags(disable_cpu_flags_); \
|
||||
double time0 = get_time(); \
|
||||
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
|
||||
src_v + OFF, kStrideUV, dst_argb_c + OFF, kStrideB, \
|
||||
kWidth, NEG kHeight); \
|
||||
double time1 = get_time(); \
|
||||
MaskCpuFlags(benchmark_cpu_info_); \
|
||||
for (int i = 0; i < benchmark_iterations_; ++i) { \
|
||||
FMT_PLANAR##To##FMT_B(src_y + OFF, kWidth, src_u + OFF, kStrideUV, \
|
||||
src_v + OFF, kStrideUV, dst_argb_opt + OFF, \
|
||||
kStrideB, kWidth, NEG kHeight); \
|
||||
} \
|
||||
double time2 = get_time(); \
|
||||
printf(" %8d us C - %8d us OPT\n", \
|
||||
static_cast<int>((time1 - time0) * 1e6), \
|
||||
static_cast<int>((time2 - time1) * 1e6 / benchmark_iterations_)); \
|
||||
int max_diff = 0; \
|
||||
/* Convert to ARGB so 565 is expanded to bytes that can be compared. */ \
|
||||
align_buffer_page_end(dst_argb32_c, kWidth* BPP_C* kHeight); \
|
||||
@ -1952,6 +1958,10 @@ TESTPLANETOE(ARGB, 1, 4, AR30, 1, 4, ARGB, 4)
|
||||
TESTPLANETOE(ABGR, 1, 4, AR30, 1, 4, ABGR, 4)
|
||||
TESTPLANETOE(AR30, 1, 4, ARGB, 1, 4, ABGR, 4)
|
||||
TESTPLANETOE(AR30, 1, 4, ABGR, 1, 4, ARGB, 4)
|
||||
TESTPLANETOE(ARGB, 1, 4, AB30, 1, 4, ARGB, 4)
|
||||
TESTPLANETOE(ABGR, 1, 4, AB30, 1, 4, ABGR, 4)
|
||||
TESTPLANETOE(AB30, 1, 4, ARGB, 1, 4, ABGR, 4)
|
||||
TESTPLANETOE(AB30, 1, 4, ABGR, 1, 4, ARGB, 4)
|
||||
|
||||
TEST_F(LibYUVConvertTest, RotateWithARGBSource) {
|
||||
// 2x2 frames
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user