diff --git a/README.chromium b/README.chromium
index be44f7aa4..75bd2cfa1 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1724
+Version: 1725
 License: BSD
 License File: LICENSE
 
diff --git a/include/libyuv/convert.h b/include/libyuv/convert.h
index 674325a38..892a3c91b 100644
--- a/include/libyuv/convert.h
+++ b/include/libyuv/convert.h
@@ -226,6 +226,17 @@ int UYVYToI420(const uint8_t* src_uyvy,
                int width,
                int height);
 
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height);
+
 // Convert M420 to I420.
 LIBYUV_API
 int M420ToI420(const uint8_t* src_m420,
@@ -375,13 +386,11 @@ int ARGB4444ToI420(const uint8_t* src_argb4444,
                    int height);
 
 #ifdef HAVE_JPEG
-// src_mjpg is pointer to raw jpeg bytes in memory
-// src_size_mjpg is size of jpeg in bytes
 // src_width/height provided by capture.
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToI420(const uint8_t* src_mjpg,
-               size_t src_size_mjpg,
+int MJPGToI420(const uint8_t* sample,
+               size_t sample_size,
                uint8_t* dst_y,
                int dst_stride_y,
                uint8_t* dst_u,
@@ -395,8 +404,8 @@ int MJPGToI420(const uint8_t* src_mjpg,
 
 // JPEG to NV21
 LIBYUV_API
-int MJPGToNV21(const uint8_t* src_mjpg,
-               size_t src_size_mjpg,
+int MJPGToNV21(const uint8_t* sample,
+               size_t sample_size,
                uint8_t* dst_y,
                int dst_stride_y,
                uint8_t* dst_vu,
@@ -408,8 +417,8 @@ int MJPGToNV21(const uint8_t* src_mjpg,
 
 // Query size of MJPG in pixels.
 LIBYUV_API
-int MJPGSize(const uint8_t* src_mjpg,
-             size_t src_size_mjpg,
+int MJPGSize(const uint8_t* sample,
+             size_t sample_size,
              int* width,
              int* height);
 #endif
diff --git a/include/libyuv/convert_argb.h b/include/libyuv/convert_argb.h
index 283ece952..b51f49d1f 100644
--- a/include/libyuv/convert_argb.h
+++ b/include/libyuv/convert_argb.h
@@ -298,6 +298,17 @@ int NV21ToRGB24(const uint8_t* src_y,
                 int width,
                 int height);
 
+// Convert NV21 to YUV24.
+LIBYUV_API
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height);
+
 // Convert NV12 to RAW.
 LIBYUV_API
 int NV12ToRAW(const uint8_t* src_y,
@@ -627,8 +638,8 @@ int AR30ToAB30(const uint8_t* src_ar30,
 // src_width/height provided by capture
 // dst_width/height for clipping determine final size.
 LIBYUV_API
-int MJPGToARGB(const uint8_t* src_mjpg,
-               size_t src_size_mjpg,
+int MJPGToARGB(const uint8_t* sample,
+               size_t sample_size,
                uint8_t* dst_argb,
                int dst_stride_argb,
                int src_width,
diff --git a/include/libyuv/mjpeg_decoder.h b/include/libyuv/mjpeg_decoder.h
index fecbedfb8..275f8d4c1 100644
--- a/include/libyuv/mjpeg_decoder.h
+++ b/include/libyuv/mjpeg_decoder.h
@@ -26,7 +26,7 @@ namespace libyuv {
 extern "C" {
 #endif
 
-LIBYUV_BOOL ValidateJpeg(const uint8_t* src_mjpg, size_t src_size_mjpg_size);
+LIBYUV_BOOL ValidateJpeg(const uint8_t* sample, size_t sample_size);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index da54f7d7a..8cfec20ef 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -295,6 +295,8 @@ extern "C" {
 #define HAS_I422TOYUY2ROW_AVX2
 #define HAS_MERGEUVROW_16_AVX2
 #define HAS_MULTIPLYROW_16_AVX2
+// TODO(fbarchard): Fix AVX2 version of YUV24
+// #define HAS_NV21TOYUV24ROW_AVX2
 #endif
 
 // The following are available for AVX512 clang x86 platforms:
@@ -330,6 +332,8 @@ extern "C" {
 #define HAS_ARGBTOUVROW_NEON
 #define HAS_ARGBTOYJROW_NEON
 #define HAS_ARGBTOYROW_NEON
+#define HAS_AYUVTOVUROW_NEON
+#define HAS_AYUVTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
 #define HAS_BYTETOFLOATROW_NEON
@@ -355,6 +359,7 @@ extern "C" {
 #define HAS_NV12TORGB565ROW_NEON
 #define HAS_NV21TOARGBROW_NEON
 #define HAS_NV21TORGB24ROW_NEON
+#define HAS_NV21TOYUV24ROW_NEON
 #define HAS_RAWTOARGBROW_NEON
 #define HAS_RAWTORGB24ROW_NEON
 #define HAS_RAWTOUVROW_NEON
@@ -402,6 +407,7 @@ extern "C" {
 
 // The following are available on AArch64 platforms:
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#define HAS_FLOATDIVTOBYTEROW_NEON
 #define HAS_SCALESUMSAMPLES_NEON
 #endif
 #if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
@@ -815,6 +821,10 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
 void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
                         uint8_t* dst_argb,
                         const struct YuvConstants* yuvconstants,
@@ -2183,6 +2193,10 @@ void NV21ToRGB24Row_C(const uint8_t* src_y,
                       uint8_t* rgb_buf,
                       const struct YuvConstants* yuvconstants,
                       int width);
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width);
 void YUY2ToARGBRow_C(const uint8_t* src_yuy2,
                      uint8_t* rgb_buf,
                      const struct YuvConstants* yuvconstants,
@@ -2349,6 +2363,10 @@ void NV21ToRGB24Row_AVX2(const uint8_t* src_y,
                          uint8_t* dst_rgb24,
                          const struct YuvConstants* yuvconstants,
                          int width);
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width);
 void NV12ToRGB565Row_AVX2(const uint8_t* src_y,
                           const uint8_t* src_uv,
                           uint8_t* dst_rgb565,
@@ -2554,6 +2572,10 @@ void NV21ToRGB24Row_Any_AVX2(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
+void NV21ToYUV24Row_Any_AVX2(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
 void NV12ToRGB565Row_Any_SSSE3(const uint8_t* y_buf,
                                const uint8_t* uv_buf,
                                uint8_t* dst_ptr,
@@ -3027,6 +3049,10 @@ void NV21ToRGB24Row_Any_NEON(const uint8_t* y_buf,
                              uint8_t* dst_ptr,
                              const struct YuvConstants* yuvconstants,
                              int width);
+void NV21ToYUV24Row_Any_NEON(const uint8_t* src_y,
+                             const uint8_t* src_vu,
+                             uint8_t* dst_yuv24,
+                             int width);
 void NV12ToRGB565Row_Any_NEON(const uint8_t* y_buf,
                              const uint8_t* uv_buf,
                              uint8_t* dst_ptr,
@@ -3345,6 +3371,19 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
                             uint8_t* dst_v,
                             int width);
 
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_C(const uint8_t* src_ayuv, int stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width);
+void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_NEON(const uint8_t* src_ayuv, int stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width);
+void AYUVToYRow_Any_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
+void AYUVToVURow_Any_NEON(const uint8_t* src_ayuv, int stride_ayuv,
+                          uint8_t* dst_vu,
+                          int width);
+
 void I422ToYUY2Row_C(const uint8_t* src_y,
                      const uint8_t* src_u,
                      const uint8_t* src_v,
@@ -3960,6 +3999,18 @@ float ScaleSumSamples_NEON(const float* src,
 void ScaleSamples_C(const float* src, float* dst, float scale, int width);
 void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
 
+void FloatDivToByteRow_C(const float* src_weights,
+                         const float* src_values,
+                         uint8_t* dst_out,
+                         uint8_t* dst_mask,
+                         int width);
+void FloatDivToByteRow_NEON(const float* src_weights,
+                            const float* src_values,
+                            uint8_t* dst_out,
+                            uint8_t* dst_mask,
+                            int width);
+
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index 522eb5224..e6bf67e16 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1724
+#define LIBYUV_VERSION 1725
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/convert.cc b/source/convert.cc
index 4b8d0dc57..b4550685e 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -880,6 +880,76 @@ int UYVYToI420(const uint8_t* src_uyvy,
   return 0;
 }
 
+// Convert AYUV to NV21.
+LIBYUV_API
+int AYUVToNV21(const uint8_t* src_ayuv,
+               int src_stride_ayuv,
+               uint8_t* dst_y,
+               int dst_stride_y,
+               uint8_t* dst_vu,
+               int dst_stride_vu,
+               int width,
+               int height) {
+  int y;
+  void (*AYUVToVURow)(const uint8_t* src_ayuv, int src_stride_ayuv,
+                      uint8_t* dst_vu, int width) =
+      AYUVToVURow_C;
+  void (*AYUVToYRow)(const uint8_t* src_ayuv, uint8_t* dst_y, int width) =
+      AYUVToYRow_C;
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    src_ayuv = src_ayuv + (height - 1) * src_stride_ayuv;
+    src_stride_ayuv = -src_stride_ayuv;
+  }
+// Placeholders for future Intel code.
+#if defined(HAS_AYUVTOYROW_SSE2)
+  if (TestCpuFlag(kCpuHasSSE2)) {
+    AYUVToVURow = AYUVToVURow_Any_SSE2;
+    AYUVToYRow = AYUVToYRow_Any_SSE2;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToVURow = AYUVToVURow_SSE2;
+      AYUVToYRow = AYUVToYRow_SSE2;
+    }
+  }
+#endif
+#if defined(HAS_AYUVTOYROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    AYUVToVURow = AYUVToVURow_Any_AVX2;
+    AYUVToYRow = AYUVToYRow_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      AYUVToVURow = AYUVToVURow_AVX2;
+      AYUVToYRow = AYUVToYRow_AVX2;
+    }
+  }
+#endif
+
+#if defined(HAS_AYUVTOYROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    AYUVToYRow = AYUVToYRow_Any_NEON;
+    AYUVToVURow = AYUVToVURow_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      AYUVToYRow = AYUVToYRow_NEON;
+      AYUVToVURow = AYUVToVURow_NEON;
+    }
+  }
+#endif
+
+  for (y = 0; y < height - 1; y += 2) {
+    AYUVToVURow(src_ayuv, src_stride_ayuv, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+    AYUVToYRow(src_ayuv + src_stride_ayuv, dst_y + dst_stride_y, width);
+    src_ayuv += src_stride_ayuv * 2;
+    dst_y += dst_stride_y * 2;
+    dst_vu += dst_stride_vu;
+  }
+  if (height & 1) {
+    AYUVToVURow(src_ayuv, 0, dst_vu, width);
+    AYUVToYRow(src_ayuv, dst_y, width);
+  }
+  return 0;
+}
+
 // Convert ARGB to I420.
 LIBYUV_API
 int ARGBToI420(const uint8_t* src_argb,
@@ -2165,6 +2235,7 @@ int Android420ToI420(const uint8_t* src_y,
   return 0;
 }
 
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/convert_argb.cc b/source/convert_argb.cc
index b376a0f38..d9660b115 100644
--- a/source/convert_argb.cc
+++ b/source/convert_argb.cc
@@ -1998,6 +1998,56 @@ int NV21ToRAW(const uint8_t* src_y,
                      dst_stride_raw, &kYvuI601Constants, width, height);
 }
 
+// Convert NV21 to YUV24
+int NV21ToYUV24(const uint8_t* src_y,
+                int src_stride_y,
+                const uint8_t* src_vu,
+                int src_stride_vu,
+                uint8_t* dst_yuv24,
+                int dst_stride_yuv24,
+                int width,
+                int height) {
+  int y;
+  void (*NV21ToYUV24Row)(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) = NV21ToYUV24Row_C;
+  if (!src_y || !src_vu || !dst_yuv24 || width <= 0 || height == 0) {
+    return -1;
+  }
+  // Negative height means invert the image.
+  if (height < 0) {
+    height = -height;
+    dst_yuv24 = dst_yuv24 + (height - 1) * dst_stride_yuv24;
+    dst_stride_yuv24 = -dst_stride_yuv24;
+  }
+#if defined(HAS_NV21TOYUV24ROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_NEON;
+    if (IS_ALIGNED(width, 16)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_NEON;
+    }
+  }
+#endif
+#if defined(HAS_NV21TOYUV24ROW_AVX2)
+  if (TestCpuFlag(kCpuHasAVX2)) {
+    NV21ToYUV24Row = NV21ToYUV24Row_Any_AVX2;
+    if (IS_ALIGNED(width, 32)) {
+      NV21ToYUV24Row = NV21ToYUV24Row_AVX2;
+    }
+  }
+#endif
+  for (y = 0; y < height; ++y) {
+    NV21ToYUV24Row(src_y, src_vu, dst_yuv24, width);
+    dst_yuv24 += dst_stride_yuv24;
+    src_y += src_stride_y;
+    if (y & 1) {
+      src_vu += src_stride_vu;
+    }
+  }
+  return 0;
+}
+
 // Convert M420 to ARGB.
 LIBYUV_API
 int M420ToARGB(const uint8_t* src_m420,
diff --git a/source/row_any.cc b/source/row_any.cc
index 031a8f649..37bd9970f 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -286,7 +286,12 @@ ANY21(MergeUVRow_Any_MSA, MergeUVRow_MSA, 0, 1, 1, 2, 15)
 #ifdef HAS_MERGEUVROW_MMI
 ANY21(MergeUVRow_Any_MMI, MergeUVRow_MMI, 0, 1, 1, 2, 7)
 #endif
-
+#ifdef HAS_NV21TOYUV24ROW_NEON
+ANY21(NV21ToYUV24Row_Any_NEON, NV21ToYUV24Row_NEON, 1, 1, 2, 3, 15)
+#endif
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+ANY21(NV21ToYUV24Row_Any_AVX2, NV21ToYUV24Row_AVX2, 1, 1, 2, 3, 31)
+#endif
 // Math functions.
 #ifdef HAS_ARGBMULTIPLYROW_SSE2
 ANY21(ARGBMultiplyRow_Any_SSE2, ARGBMultiplyRow_SSE2, 0, 4, 4, 4, 3)
@@ -702,6 +707,10 @@ ANY11(UYVYToYRow_Any_MSA, UYVYToYRow_MSA, 1, 4, 1, 31)
 #ifdef HAS_UYVYTOYROW_MMI
 ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
 #endif
+
+#ifdef HAS_AYUVTOYROW_NEON
+ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
+#endif
 #ifdef HAS_RGB24TOARGBROW_NEON
 ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
 #endif
@@ -1381,6 +1390,36 @@ ANY12S(UYVYToUVRow_Any_MMI, UYVYToUVRow_MMI, 1, 4, 15)
 #endif
 #undef ANY12S
 
+// Any 1 to 1 with source stride (2 rows of source). Outputs UV plane.
+// 128 byte row allows for 32 avx ARGB pixels.
+#define ANY11S(NAMEANY, ANY_SIMD, UVSHIFT, BPP, MASK)                         \
+  void NAMEANY(const uint8_t* src_ptr, int src_stride_ptr, uint8_t* dst_vu,   \
+               int width) {                                                   \
+    SIMD_ALIGNED(uint8_t temp[128 * 3]);                                      \
+    memset(temp, 0, 128 * 2); /* for msan */                                  \
+    int r = width & MASK;                                                     \
+    int n = width & ~MASK;                                                    \
+    if (n > 0) {                                                              \
+      ANY_SIMD(src_ptr, src_stride_ptr, dst_vu, n);                           \
+    }                                                                         \
+    memcpy(temp, src_ptr + (n >> UVSHIFT) * BPP, SS(r, UVSHIFT) * BPP);       \
+    memcpy(temp + 128, src_ptr + src_stride_ptr + (n >> UVSHIFT) * BPP,       \
+           SS(r, UVSHIFT) * BPP);                                             \
+    if ((width & 1) && UVSHIFT == 0) { /* repeat last pixel for subsample */  \
+      memcpy(temp + SS(r, UVSHIFT) * BPP, temp + SS(r, UVSHIFT) * BPP - BPP,  \
+             BPP);                                                            \
+      memcpy(temp + 128 + SS(r, UVSHIFT) * BPP,                               \
+             temp + 128 + SS(r, UVSHIFT) * BPP - BPP, BPP);                   \
+    }                                                                         \
+    ANY_SIMD(temp, 128, temp + 256, MASK + 1);                                \
+    memcpy(dst_vu + (n >> 1) * 2, temp + 256, SS(r, 1) * 2);                  \
+  }
+
+#ifdef HAS_AYUVTOVUROW_NEON
+ANY11S(AYUVToVURow_Any_NEON, AYUVToVURow_NEON, 0, 4, 15)
+#endif
+#undef ANY11S
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_common.cc b/source/row_common.cc
index 2bbc5adbf..257daa6c0 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -3231,6 +3231,73 @@ void GaussCol_C(const uint16_t* src0,
   }
 }
 
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_C(const uint8_t* src_y,
+                      const uint8_t* src_vu,
+                      uint8_t* dst_yuv24,
+                      int width) {
+
+  int x;
+  for (x = 0; x < width - 1; x += 2) {
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+    dst_yuv24[3] = src_vu[0];  // V
+    dst_yuv24[4] = src_vu[1];  // U
+    dst_yuv24[5] = src_y[1];   // Y1
+    src_y += 2;
+    src_vu += 2;
+    dst_yuv24 += 6;  // Advance 2 pixels.
+  }
+  if (width & 1) {
+    dst_yuv24[0] = src_vu[0];  // V
+    dst_yuv24[1] = src_vu[1];  // U
+    dst_yuv24[2] = src_y[0];   // Y0
+  }
+}
+
+// Filter 2 rows of AYUV UV's (444) into VU (420).
+void AYUVToVURow_C(const uint8_t* src_ayuv,
+                   int src_stride_ayuv,
+                   uint8_t* dst_vu,
+                   int width) {
+  // Output a row of VU values, filtering 2x2 rows of AYUV.
+  int x;
+  for (x = 0; x < width; x += 2) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[4] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 4] + 2) >> 2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[5] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 5] + 2) >> 2;
+    src_ayuv += 8;
+    dst_vu += 2;
+  }
+  if (width & 1) {
+    dst_vu[0] = (src_ayuv[0] + src_ayuv[0] + src_ayuv[src_stride_ayuv + 0] + src_ayuv[src_stride_ayuv + 0] + 2) >> 2;
+    dst_vu[1] = (src_ayuv[1] + src_ayuv[1] + src_ayuv[src_stride_ayuv + 1] + src_ayuv[src_stride_ayuv + 1] + 2) >> 2;
+  }
+}
+
+// Copy row of AYUV Y's into Y
+void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
+  // Output a row of Y values.
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_y[x] = src_ayuv[2];  // v,u,y,a
+    src_ayuv += 4;
+  }
+}
+
+// Divide values by weights and provide a mask to indicate weight of 0.
+void FloatDivToByteRow_C(const float* src_weights,
+                         const float* src_values,
+                         uint8_t* dst_out,
+                         uint8_t* dst_mask,
+                         int width) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    dst_out[x] = Clamp(src_values[x] / src_weights[x]);
+    dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 }  // namespace libyuv
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 8d3cb81ce..99d73053f 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -6669,6 +6669,121 @@ void ARGBLumaColorTableRow_SSSE3(const uint8_t* src_argb,
 }
 #endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
 
+
+#ifdef HAS_NV21TOYUV24ROW_AVX2
+
+// begin NV21ToYUV24Row_AVX2 constants
+static const ulvec8 kBLEND0 = {0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80, 0x00,
+                               0x80, 0x80, 0x00, 0x80, 0x80, 0x00, 0x80, 0x80,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00};
+
+static const ulvec8 kBLEND1 = {0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80};
+
+static const ulvec8 kBLEND2 = {0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00,
+                               0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80,
+                               0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00, 0x00,
+                               0x80, 0x00, 0x00, 0x80, 0x00, 0x00, 0x80, 0x00};
+
+static const ulvec8 kSHUF0 = {0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05,
+                              0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02, 0x0d,
+                              0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80, 0x05};
+
+static const ulvec8 kSHUF1 = {0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80,
+                              0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80, 0x02,
+                              0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f, 0x80};
+
+static const ulvec8 kSHUF2 = {0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f,
+                              0x0a, 0x80, 0x00, 0x0b, 0x80, 0x01, 0x0c, 0x80,
+                              0x02, 0x0d, 0x80, 0x03, 0x0e, 0x80, 0x04, 0x0f};
+
+static const ulvec8 kSHUF3 = {0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80,
+                              0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80, 0x80,
+                              0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a, 0x80};
+
+static const ulvec8 kSHUF4 = {0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a,
+                              0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07, 0x80,
+                              0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80, 0x0a};
+
+static const ulvec8 kSHUF5 = {0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80,
+                              0x80, 0x05, 0x80, 0x80, 0x06, 0x80, 0x80, 0x07,
+                              0x80, 0x80, 0x08, 0x80, 0x80, 0x09, 0x80, 0x80};
+
+// NV21ToYUV24Row_AVX2
+void NV21ToYUV24Row_AVX2(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+
+  uint8_t* src_y_ptr;
+  uint64_t src_offset = 0;
+  uint64_t width64;
+
+  width64 = width;
+  src_y_ptr = (uint8_t*)src_y;
+
+  asm volatile(
+      "vmovdqu    %5, %%ymm0                     \n"  // init blend value
+      "vmovdqu    %6, %%ymm1                     \n"  // init blend value
+      "vmovdqu    %7, %%ymm2                     \n"  // init blend value
+      // "sub     $0x20, %3                      \n"  // sub 32 from width for final loop
+
+      LABELALIGN
+      "1:                                        \n"  // label 1
+      "vmovdqu    (%0,%4), %%ymm3                \n"  // src_y
+      "vmovdqu    1(%1,%4), %%ymm4               \n"  // src_uv+1
+      "vmovdqu    (%1), %%ymm5                   \n"  // src_uv
+      "vpshufb    %8, %%ymm3, %%ymm13            \n"  // y, kSHUF0 for shuf
+      "vpshufb    %9, %%ymm4, %%ymm14            \n"  // uv+1, kSHUF1 for shuf
+      "vpshufb    %10, %%ymm5, %%ymm15           \n"  // uv, kSHUF2 for shuf
+      "vpshufb    %11, %%ymm3, %%ymm3            \n"  // y kSHUF3 for shuf
+      "vpshufb    %12, %%ymm4, %%ymm4            \n"  // uv+1 kSHUF4 for shuf
+      "vpblendvb  %%ymm0, %%ymm14, %%ymm13, %%ymm12 \n"  // blend 0
+      "vpblendvb  %%ymm0, %%ymm13, %%ymm14, %%ymm14 \n"  // blend 0
+      "vpblendvb  %%ymm2, %%ymm15, %%ymm12, %%ymm12 \n"  // blend 2
+      "vpblendvb  %%ymm1, %%ymm15, %%ymm14, %%ymm13 \n"  // blend 1
+      "vpshufb    %13, %%ymm5, %%ymm15           \n"  // shuffle const
+      "vpor       %%ymm4, %%ymm3, %%ymm5         \n"  // get results
+      "vmovdqu    %%ymm12, 0x20(%2)              \n"  // store dst_yuv+20h
+      "vpor       %%ymm15, %%ymm5, %%ymm3        \n"  // get results
+      "add        $0x20, %4                      \n"  // add to src buffer ptr
+      "vinserti128 $0x1, %%xmm3, %%ymm13, %%ymm4 \n"  // insert
+      "vperm2i128 $0x31, %%ymm13, %%ymm3, %%ymm5 \n"  // insert
+      "vmovdqu    %%ymm4, (%2)                   \n"  // store dst_yuv
+      "vmovdqu    %%ymm5, 0x40(%2)               \n"  // store dst_yuv+40h
+      "add        $0x60,%2                       \n"  // add to dst buffer ptr
+      // "cmp     %3, %4                         \n"  // (width64 - 32 bytes) and src_offset
+      "sub        $0x20,%3                       \n"  // 32 pixels per loop
+      "jg         1b                             \n"
+      "vzeroupper                                \n"  // sse-avx2 transitions
+
+      : "+r"(src_y),       //%0
+        "+r"(src_vu),      //%1
+        "+r"(dst_yuv24),   //%2
+        "+r"(width64),     //%3
+        "+r"(src_offset)   //%4
+      : "m"(kBLEND0),      //%5
+        "m"(kBLEND1),      //%6
+        "m"(kBLEND2),      //%7
+        "m"(kSHUF0),       //%8
+        "m"(kSHUF1),       //%9
+        "m"(kSHUF2),       //%10
+        "m"(kSHUF3),       //%11
+        "m"(kSHUF4),       //%12
+        "m"(kSHUF5)        //%13
+      : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm12", "xmm13", "xmm14", "xmm15");
+}
+#endif  // HAS_NV21TOYUV24ROW_AVX2
+
 #endif  // defined(__x86_64__) || defined(__i386__)
 
 #ifdef __cplusplus
diff --git a/source/row_neon.cc b/source/row_neon.cc
index ff87e74c6..e440209fc 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2685,6 +2685,77 @@ void ByteToFloatRow_NEON(const uint8_t* src,
       : "cc", "memory", "q1", "q2", "q3");
 }
 
+// Convert biplanar NV21 to packed YUV24
+void NV21ToYUV24Row_NEON(const uint8_t* src_y,
+                         const uint8_t* src_vu,
+                         uint8_t* dst_yuv24,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "vld1.8     {q2}, [%0]!                    \n"  // load 16 Y values
+      "vld2.8     {d0, d2}, [%1]!                \n"  // load 8 VU values
+      "vmov       d1, d0                         \n"
+      "vzip.u8    d0, d1                         \n"  // VV
+      "vmov       d3, d2                         \n"
+      "vzip.u8    d2, d3                         \n"  // UU
+      "subs       %3, %3, #16                    \n"  // 16 pixels per loop
+      "vst3.8     {d0, d2, d4}, [%2]!            \n"  // store 16 YUV pixels
+      "vst3.8     {d1, d3, d5}, [%2]!            \n"
+      "bgt        1b                             \n"
+      : "+r"(src_y),      // %0
+        "+r"(src_vu),     // %1
+        "+r"(dst_yuv24),  // %2
+        "+r"(width)       // %3
+      :
+      : "cc", "memory", "q0", "q1", "q2");
+}
+
+void AYUVToVURow_NEON(const uint8_t* src_ayuv,
+                      int src_stride_ayuv,
+                      uint8_t* dst_vu,
+                      int width) {
+  asm volatile(
+      "add        %1, %0, %1                     \n"  // src_stride + src_AYUV
+      "1:                                        \n"
\n" // load 8 AYUV pixels. + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels. + "vpaddl.u8 q0, q0 \n" // V 16 bytes -> 8 shorts. + "vpaddl.u8 q1, q1 \n" // U 16 bytes -> 8 shorts. + "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more AYUV pixels. + "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 AYUV pixels. + "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts. + "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts. + "vqrshrun.s16 d0, q0, #2 \n" // 2x2 average + "vqrshrun.s16 d1, q1, #2 \n" + "subs %3, %3, #16 \n" // 16 processed per loop. + "vst2.8 {d0, d1}, [%2]! \n" // store 8 pixels VU. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_stride_ayuv), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7" + ); +} + +// Copy row of AYUV Y's into Y. +// Similar to ARGBExtractAlphaRow_NEON +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile ( + "1: \n" + "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels + "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels + "subs %2, %2, #16 \n" // 16 processed per loop + "vst1.8 {q2}, [%1]! \n" // store 16 Y's. + "bgt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "q0", "q1", "q2", "q3"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__).. #ifdef __cplusplus diff --git a/source/row_neon64.cc b/source/row_neon64.cc index 24b4520ba..5d045f645 100644 --- a/source/row_neon64.cc +++ b/source/row_neon64.cc @@ -2876,6 +2876,113 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"); } +// Convert biplanar NV21 to packed YUV24 +void NV21ToYUV24Row_NEON(const uint8_t* src_y, + const uint8_t* src_vu, + uint8_t* dst_yuv24, + int width) { + asm volatile ( + "1: \n" + "ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values + "ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values + "zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values + "zip1 v1.16b, v1.16b, v1.16b \n" // replicate U values + "subs %w3, %w3, #16 \n" // 16 pixels per loop + "st3 {v0.16b,v1.16b,v2.16b}, [%2], #48 \n" // store 16 YUV pixels + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_vu), // %1 + "+r"(dst_yuv24), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2"); +} + +void AYUVToVURow_NEON(const uint8_t* src_ayuv, + int src_stride_ayuv, + uint8_t* dst_vu, + int width) { + const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv; + asm volatile( + + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. + "uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts. + "ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16 + "uadalp v0.8h, v4.16b \n" // V 16 bytes -> 8 shorts. + "uadalp v1.8h, v5.16b \n" // U 16 bytes -> 8 shorts. + "uqrshrn v0.8b, v0.8h, #2 \n" // 2x2 average + "uqrshrn v1.8b, v1.8h, #2 \n" + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "st2 {v0.8b,v1.8b}, [%2], #16 \n" // store 8 pixels VU. 
+ "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(src_ayuv_1), // %1 + "+r"(dst_vu), // %2 + "+r"(width) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" + ); +} + +// Copy row of AYUV Y's into Y +void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) { + asm volatile ( + "1: \n" + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels + "subs %w2, %w2, #16 \n" // 16 pixels per loop + "st1 {v2.16b}, [%1], #16 \n" // store 16 Y pixels + "b.gt 1b \n" + : "+r"(src_ayuv), // %0 + "+r"(dst_y), // %1 + "+r"(width) // %2 + : + : "cc", "memory", "v0", "v1", "v2", "v3"); +} + +void FloatDivToByteRow_NEON(const float* src_weights, + const float* src_values, + uint8_t* dst_out, + uint8_t* dst_mask, + int width) { + asm volatile( + "movi v0.4s, #0 \n" + + "1: \n" + "ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights + "ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values + "subs %w4, %w4, #8 \n" // 8 pixels per loop + + "fdiv v1.4s, v3.4s, v1.4s \n" // values / weights + "fdiv v2.4s, v4.4s, v2.4s \n" + + "fcvtzu v1.4s, v1.4s \n" // float to int + "fcvtzu v2.4s, v2.4s \n" // float to int + "uqxtn v1.4h, v1.4s \n" // 8 shorts + "uqxtn2 v1.8h, v2.4s \n" + "uqxtn v1.8b, v1.8h \n" // 8 bytes + + "st1 {v1.8b}, [%2], #8 \n" // store 8 byte out + + "fcmgt v5.4s, v1.4s, v0.4s \n" // cmp weight to zero + "fcmgt v6.4s, v2.4s, v0.4s \n" + "uqxtn v5.4h, v5.4s \n" // 8 shorts + "uqxtn2 v5.8h, v6.4s \n" + "uqxtn v5.8b, v1.8h \n" // 8 bytes + + "st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask + + "b.gt 1b \n" + : "+r"(src_weights), // %0 + "+r"(src_values), // %1 + "+r"(dst_out), // %2 + "+r"(dst_mask), // %3 + "+r"(width) // %4 + : + : "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6"); +} + #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus diff --git a/unit_test/convert_test.cc b/unit_test/convert_test.cc index d97b4fc72..0fc6f873e 100644 --- a/unit_test/convert_test.cc +++ b/unit_test/convert_test.cc @@ -680,7 +680,7 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1) TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1, 2) TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2) -#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, \ +#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,\ W1280, DIFF, N, NEG, OFF) \ TEST_F(LibYUVConvertTest, FMT_PLANAR##To##FMT_B##N) { \ const int kWidth = ((W1280) > 0) ? 
     const int kWidth = ((W1280) > 0) ? (W1280) : 1;                           \
@@ -716,9 +716,9 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
     align_buffer_page_end(dst_argb32_opt, kWidth * 4 * kHeight);              \
     memset(dst_argb32_c, 2, kWidth * 4 * kHeight);                            \
     memset(dst_argb32_opt, 102, kWidth * 4 * kHeight);                        \
-    FMT_B##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
+    FMT_C##ToARGB(dst_argb_c, kStrideB, dst_argb32_c, kWidth * 4, kWidth,     \
                   kHeight);                                                   \
-    FMT_B##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
+    FMT_C##ToARGB(dst_argb_opt, kStrideB, dst_argb32_opt, kWidth * 4, kWidth, \
                   kHeight);                                                   \
     int max_diff = 0;                                                         \
     for (int i = 0; i < kHeight; ++i) {                                       \
@@ -740,25 +740,27 @@ TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1, 2)
     free_aligned_buffer_page_end(dst_argb32_opt);                             \
   }
 
-#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, DIFF) \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+#define TESTBIPLANARTOB(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C,       \
+                        BPP_B, DIFF)                                          \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                    benchmark_width_ - 4, DIFF, _Any, +, 0)                    \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                    benchmark_width_, DIFF, _Unaligned, +, 1)                  \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                    benchmark_width_, DIFF, _Invert, -, 0)                     \
-  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B,            \
+  TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, BPP_B,     \
                    benchmark_width_, DIFF, _Opt, +, 0)
 
-TESTBIPLANARTOB(NV12, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ARGB, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV21, 2, 2, ABGR, 4, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RGB24, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RAW, 3, 2)
-TESTBIPLANARTOB(NV21, 2, 2, RAW, 3, 2)
-TESTBIPLANARTOB(NV12, 2, 2, RGB565, 2, 9)
+TESTBIPLANARTOB(NV12, 2, 2, ARGB, ARGB, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ARGB, ARGB, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, ABGR, ABGR, 4, 2)
+TESTBIPLANARTOB(NV21, 2, 2, ABGR, ABGR, 4, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB24, RGB24, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RGB24, RGB24, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RAW, RAW, 3, 2)
+TESTBIPLANARTOB(NV21, 2, 2, RAW, RAW, 3, 2)
+TESTBIPLANARTOB(NV12, 2, 2, RGB565, RGB565, 2, 9)
+TESTBIPLANARTOB(NV21, 2, 2, YUV24, RAW, 3, 2)
 
 #ifdef DO_THREE_PLANES
 // Do 3 allocations for yuv.  conventional but slower.
@@ -978,6 +980,7 @@ TESTATOBIPLANAR(ARGB, 1, 4, NV12, 2, 2)
 TESTATOBIPLANAR(ARGB, 1, 4, NV21, 2, 2)
 TESTATOBIPLANAR(YUY2, 2, 4, NV12, 2, 2)
 TESTATOBIPLANAR(UYVY, 2, 4, NV12, 2, 2)
+TESTATOBIPLANAR(AYUV, 1, 4, NV21, 2, 2)
 
 #define TESTATOBI(FMT_A, BPP_A, STRIDE_A, HEIGHT_A, FMT_B, BPP_B, STRIDE_B,   \
                   HEIGHT_B, W1280, DIFF, N, NEG, OFF)                         \
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 756089558..fcd073a2a 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -3267,4 +3267,85 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
   EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
 }
 
+float TestFloatDivToByte(int benchmark_width,
+                         int benchmark_height,
+                         int benchmark_iterations,
+                         float scale,
+                         bool opt) {
+  int i, j;
+  // NEON does multiple of 8, so round count up
+  const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
+  align_buffer_page_end(src_weights, kPixels * 4);
+  align_buffer_page_end(src_values, kPixels * 4);
+  align_buffer_page_end(dst_out_c, kPixels);
+  align_buffer_page_end(dst_out_opt, kPixels);
+  align_buffer_page_end(dst_mask_c, kPixels);
+  align_buffer_page_end(dst_mask_opt, kPixels);
+
+  // Randomize works but may contain some denormals affecting performance.
+  // MemRandomize(orig_y, kPixels * 4);
+  // large values are problematic.  audio is really -1 to 1.
+  for (i = 0; i < kPixels; ++i) {
+    (reinterpret_cast<float*>(src_weights))[i] = scale;
+    (reinterpret_cast<float*>(src_values))[i] = sinf(static_cast<float>(i) * 0.1f);
+  }
+  memset(dst_out_c, 0, kPixels);
+  memset(dst_out_opt, 1, kPixels);
+  memset(dst_mask_c, 2, kPixels);
+  memset(dst_mask_opt, 3, kPixels);
+
+  FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                      reinterpret_cast<float*>(src_values),
+                      dst_out_c, dst_mask_c, kPixels);
+
+  for (j = 0; j < benchmark_iterations; j++) {
+    if (opt) {
+#ifdef HAS_FLOATDIVTOBYTEROW_NEON
+      FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
+                             reinterpret_cast<float*>(src_values),
+                             dst_out_opt, dst_mask_opt, kPixels);
+#else
+      FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                          reinterpret_cast<float*>(src_values),
+                          dst_out_opt, dst_mask_opt, kPixels);
+#endif
+    } else {
+      FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
+                          reinterpret_cast<float*>(src_values),
+                          dst_out_opt, dst_mask_opt, kPixels);
+    }
+  }
+
+  uint8_t max_diff = 0;
+  for (i = 0; i < kPixels; ++i) {
+    uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
+                       abs(dst_mask_c[i] - dst_mask_opt[i]);
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(src_weights);
+  free_aligned_buffer_page_end(src_values);
+  free_aligned_buffer_page_end(dst_out_c);
+  free_aligned_buffer_page_end(dst_out_opt);
+  free_aligned_buffer_page_end(dst_mask_c);
+  free_aligned_buffer_page_end(dst_mask_opt);
+
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
+  float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_, 1.2f, false);
+  EXPECT_EQ(0, diff);
+}
+
+TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
+  float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
+                                  benchmark_iterations_, 1.2f, true);
+  EXPECT_EQ(0, diff);
+}
+
 }  // namespace libyuv
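Reviewer note (not part of the patch): a minimal usage sketch of the two new public entry points, AYUVToNV21() and NV21ToYUV24(), declared above in convert.h and convert_argb.h. The helper name, buffer layout, and tightly packed strides are illustrative assumptions, not part of the libyuv API.

// Sketch only: chains the new conversions added by this roll.
// Assumes a caller-provided packed AYUV frame (4 bytes per pixel).
#include <vector>

#include "libyuv/convert.h"       // AYUVToNV21()
#include "libyuv/convert_argb.h"  // NV21ToYUV24()

bool AyuvToPackedYuv24(const uint8_t* src_ayuv, int width, int height) {
  const int half_height = (height + 1) / 2;
  const int vu_stride = ((width + 1) / 2) * 2;  // interleaved VU, 2x2 subsampled

  // NV21 intermediate: full-resolution Y plane plus half-resolution VU plane.
  std::vector<uint8_t> y(static_cast<size_t>(width) * height);
  std::vector<uint8_t> vu(static_cast<size_t>(vu_stride) * half_height);
  // Packed output: 3 bytes (V, U, Y) per pixel.
  std::vector<uint8_t> yuv24(static_cast<size_t>(width) * height * 3);

  // AYUV is 4 bytes per pixel, so a tightly packed stride is width * 4.
  if (libyuv::AYUVToNV21(src_ayuv, width * 4, y.data(), width, vu.data(),
                         vu_stride, width, height) != 0) {
    return false;
  }
  // Expand the NV21 planes back out into the packed YUV24 buffer.
  return libyuv::NV21ToYUV24(y.data(), width, vu.data(), vu_stride,
                             yuv24.data(), width * 3, width, height) == 0;
}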