diff --git a/README.chromium b/README.chromium index ee7ba2c47..ad847bd39 100644 --- a/README.chromium +++ b/README.chromium @@ -1,6 +1,6 @@ Name: libyuv URL: http://code.google.com/p/libyuv/ -Version: 510 +Version: 511 License: BSD License File: LICENSE diff --git a/include/libyuv/version.h b/include/libyuv/version.h index d61333595..a09fab55d 100644 --- a/include/libyuv/version.h +++ b/include/libyuv/version.h @@ -11,6 +11,6 @@ #ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT #define INCLUDE_LIBYUV_VERSION_H_ -#define LIBYUV_VERSION 510 +#define LIBYUV_VERSION 511 #endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT diff --git a/source/planar_functions.cc b/source/planar_functions.cc index 6b8920891..d2807cb58 100644 --- a/source/planar_functions.cc +++ b/source/planar_functions.cc @@ -911,6 +911,10 @@ int ARGBSepia(uint8* dst_argb, int dst_stride_argb, IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) { ARGBSepiaRow = ARGBSepiaRow_SSSE3; } +#elif defined(HAS_ARGBSEPIAROW_NEON) + if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) { + ARGBSepiaRow = ARGBSepiaRow_NEON; + } #endif uint8* dst = dst_argb + dst_y * dst_stride_argb + dst_x * 4; for (int y = 0; y < height; ++y) { diff --git a/source/row_neon.cc b/source/row_neon.cc index 1db91443c..0dd879013 100644 --- a/source/row_neon.cc +++ b/source/row_neon.cc @@ -2565,7 +2565,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { "vmull.u8 q2, d0, d24 \n" // B "vmlal.u8 q2, d1, d25 \n" // G "vmlal.u8 q2, d2, d26 \n" // R - "vqshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y + "vqshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit B "vmov d1, d0 \n" // G "vmov d2, d0 \n" // R "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels. @@ -2578,6 +2578,47 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { ); } +// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
+// b = (r * 35 + g * 68 + b * 17) >> 7   (at most 239, never clamps)
+// g = min(255, (r * 45 + g * 88 + b * 22) >> 7)
+// r = min(255, (r * 50 + g * 98 + b * 24) >> 7)
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {  // In place; A kept.
+  asm volatile (  // Needs width > 0 and a multiple of 8 (no tail loop).
+    "vmov.u8    d20, #17                       \n"  // BB coefficient
+    "vmov.u8    d21, #68                       \n"  // BG coefficient
+    "vmov.u8    d22, #35                       \n"  // BR coefficient
+    "vmov.u8    d24, #22                       \n"  // GB coefficient
+    "vmov.u8    d25, #88                       \n"  // GG coefficient
+    "vmov.u8    d26, #45                       \n"  // GR coefficient
+    "vmov.u8    d28, #24                       \n"  // RB coefficient
+    "vmov.u8    d29, #98                       \n"  // RG coefficient
+    "vmov.u8    d30, #50                       \n"  // RR coefficient
+    ".p2align   2                              \n"
+  "1:                                          \n"
+    "vld4.8     {d0, d1, d2, d3}, [%0]         \n"  // load 8 ARGB pixels.
+    "subs       %1, %1, #8                     \n"  // 8 processed per loop.
+    "vmull.u8   q2, d0, d20                    \n"  // B to Sepia B
+    "vmlal.u8   q2, d1, d21                    \n"  // G
+    "vmlal.u8   q2, d2, d22                    \n"  // R
+    "vmull.u8   q3, d0, d24                    \n"  // B to Sepia G
+    "vmlal.u8   q3, d1, d25                    \n"  // G
+    "vmlal.u8   q3, d2, d26                    \n"  // R
+    "vmull.u8   q8, d0, d28                    \n"  // B to Sepia R
+    "vmlal.u8   q8, d1, d29                    \n"  // G
+    "vmlal.u8   q8, d2, d30                    \n"  // R
+    "vqshrn.u16 d0, q2, #7                     \n"  // 16 bit to 8 bit B
+    "vqshrn.u16 d1, q3, #7                     \n"  // G, sums are unsigned
+    "vqshrn.u16 d2, q8, #7                     \n"  // R, clamps to 255
+    "vst4.8     {d0, d1, d2, d3}, [%0]!        \n"  // store 8 ARGB pixels.
+    "bgt        1b                             \n"
+  : "+r"(dst_argb),  // %0
+    "+r"(width)      // %1
+  :
+  : "cc", "memory", "q0", "q1", "q2", "q3",
+    "q8", "q10", "q11", "q12", "q13", "q14", "q15"  // q8 = Sepia R sum
+  );
+}
+
 #endif  // __ARM_NEON__
 
 #ifdef __cplusplus