diff --git a/include/libyuv/planar_functions.h b/include/libyuv/planar_functions.h
index 324bb1ed0..91137baba 100644
--- a/include/libyuv/planar_functions.h
+++ b/include/libyuv/planar_functions.h
@@ -498,6 +498,12 @@ int HalfFloatPlane(const uint16_t* src_y,
                    int width,
                    int height);
 
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+// Each output is (float)src_y[i] * scale for i in [0, width).
+// Returns 0 on success, or -1 if a pointer is NULL or width is not positive.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
+
 // Quantize a rectangle of ARGB. Alpha unaffected.
 // scale is a 16 bit fractional fixed point scaler between 0 and 65535.
 // interval_size should be a value between 1 and 255.
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index ac91369a4..b5a42d0e6 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -308,6 +308,7 @@ extern "C" {
 #define HAS_ARGBTOYROW_NEON
 #define HAS_BGRATOUVROW_NEON
 #define HAS_BGRATOYROW_NEON
+#define HAS_BYTETOFLOATROW_NEON
 #define HAS_COPYROW_NEON
 #define HAS_HALFFLOATROW_NEON
 #define HAS_I400TOARGBROW_NEON
@@ -3352,6 +3353,15 @@ void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
                           uint16_t* dst_ptr,
                           float param,
                           int width);
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width);
+void ByteToFloatRow_Any_NEON(const uint8_t* src,
+                             float* dst,
+                             float scale,
+                             int width);
 
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                              uint8_t* dst_argb,
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 77d71633f..5eae3f763 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -3123,6 +3123,29 @@ int HalfFloatPlane(const uint16_t* src_y,
   return 0;
 }
 
+// Convert a buffer of bytes to floats, scale the values and store as floats.
+// Returns 0 on success, or -1 if a pointer is NULL or width is not positive.
+LIBYUV_API
+int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
+  void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
+                         int width) = ByteToFloatRow_C;
+  if (!src_y || !dst_y || width <= 0) {
+    return -1;
+  }
+#if defined(HAS_BYTETOFLOATROW_NEON)
+  if (TestCpuFlag(kCpuHasNEON)) {
+    ByteToFloatRow = ByteToFloatRow_Any_NEON;
+    // Widths that are a multiple of 8 can skip the remainder handling.
+    if (IS_ALIGNED(width, 8)) {
+      ByteToFloatRow = ByteToFloatRow_NEON;
+    }
+  }
+#endif
+
+  ByteToFloatRow(src_y, dst_y, scale, width);
+  return 0;
+}
+
 // Apply a lumacolortable to each ARGB pixel.
 LIBYUV_API
 int ARGBLumaColorTable(const uint8_t* src_argb,
diff --git a/source/row_any.cc b/source/row_any.cc
index 39bc6b03c..cc5914dd2 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -807,37 +807,54 @@ ANY11C(Convert8To16Row_Any_AVX2,
 #undef ANY11C
 
 // Any 1 to 1 with parameter and shorts to byte. BPP measures in shorts.
-#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)             \
-  void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, \
-               int width) {                                         \
-    SIMD_ALIGNED(uint16_t temp[32 * 2]);                            \
-    memset(temp, 0, 64); /* for msan */                             \
-    int r = width & MASK;                                           \
-    int n = width & ~MASK;                                          \
-    if (n > 0) {                                                    \
-      ANY_SIMD(src_ptr, dst_ptr, param, n);                         \
-    }                                                               \
-    memcpy(temp, src_ptr + n, r * SBPP);                            \
-    ANY_SIMD(temp, temp + 16, param, MASK + 1);                     \
-    memcpy(dst_ptr + n, temp + 16, r * BPP);                        \
+// ST/T are the source/destination element types; SBPP/BPP are their byte sizes.
+#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK)             \
+  void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
+    SIMD_ALIGNED(ST temp[32]);                                          \
+    SIMD_ALIGNED(T out[32]);                                            \
+    memset(temp, 0, SBPP * 32); /* for msan */                          \
+    int r = width & MASK;                                               \
+    int n = width & ~MASK;                                              \
+    if (n > 0) {                                                        \
+      ANY_SIMD(src_ptr, dst_ptr, param, n);                             \
+    }                                                                   \
+    memcpy(temp, src_ptr + n, r * SBPP);                                \
+    ANY_SIMD(temp, out, param, MASK + 1);                               \
+    memcpy(dst_ptr + n, out, r * BPP);                                  \
   }
 
 #ifdef HAS_HALFFLOATROW_SSE2
-ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
 #endif
 #ifdef HAS_HALFFLOATROW_AVX2
-ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
 #endif
 #ifdef HAS_HALFFLOATROW_F16C
-ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
-ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
+ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
+ANY11P16(HalfFloat1Row_Any_F16C,
+         HalfFloat1Row_F16C,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         15)
 #endif
 #ifdef HAS_HALFFLOATROW_NEON
-ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
-ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
+ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
+ANY11P16(HalfFloat1Row_Any_NEON,
+         HalfFloat1Row_NEON,
+         uint16_t,
+         uint16_t,
+         2,
+         2,
+         7)
 #endif
 #ifdef HAS_HALFFLOATROW_MSA
-ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
+ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
+#endif
+#ifdef HAS_BYTETOFLOATROW_NEON
+// BPP is 4 (bytes per float) so every remainder float is copied to dst_ptr.
+ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 4, 7)
 #endif
 #undef ANY11P16
 
diff --git a/source/row_common.cc b/source/row_common.cc
index 049130af2..da97821f7 100644
--- a/source/row_common.cc
+++ b/source/row_common.cc
@@ -2774,6 +2774,15 @@ void HalfFloatRow_C(const uint16_t* src,
   }
 }
 
+// Convert each of width bytes to float, multiply by scale and store in dst.
+void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
+  int i;
+  for (i = 0; i < width; ++i) {
+    float value = src[i] * scale;
+    dst[i] = value;
+  }
+}
+
 void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
                              uint8_t* dst_argb,
                              int width,
diff --git a/source/row_neon.cc b/source/row_neon.cc
index b77d7200a..8b6c19520 100644
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@@ -2659,6 +2659,33 @@ void HalfFloatRow_NEON(const uint16_t* src,
       : "cc", "memory", "q0", "q1", "q2", "q3");
 }
 
+// Convert bytes to scaled floats, 8 per loop; width must be a multiple of 8.
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "vdup.32    q0, %3                        \n"
+
+      "1:                                        \n"
+      "vld1.8     {d2}, [%0]!                   \n"  // load 8 bytes
+      "subs       %2, %2, #8                    \n"  // 8 pixels per loop
+      "vmovl.u8   q1, d2                        \n"  // 8 shorts
+      "vmovl.u16  q2, d2                        \n"  // 8 ints
+      "vmovl.u16  q3, d3                        \n"
+      "vcvt.f32.u32 q2, q2                      \n"  // 8 floats
+      "vcvt.f32.u32 q3, q3                      \n"
+      "vmul.f32   q2, q2, d0[0]                 \n"  // scale
+      "vmul.f32   q3, q3, d0[0]                 \n"
+      "vst1.8     {q2, q3}, [%1]!               \n"  // store 8 floats
+      "bgt        1b                            \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "cc", "memory", "q0", "q1", "q2", "q3");
+}
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
 
 #ifdef __cplusplus
diff --git a/source/row_neon64.cc b/source/row_neon64.cc
index f6cf13201..24b4520ba 100644
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@@ -2700,6 +2700,31 @@ void HalfFloatRow_NEON(const uint16_t* src,
       : "cc", "memory", "v1", "v2", "v3");
 }
 
+// Convert bytes to scaled floats, 8 per loop; width must be a multiple of 8.
+void ByteToFloatRow_NEON(const uint8_t* src,
+                         float* dst,
+                         float scale,
+                         int width) {
+  asm volatile(
+      "1:                                        \n"
+      "ld1        {v1.8b}, [%0], #8             \n"  // load 8 bytes
+      "subs       %w2, %w2, #8                  \n"  // 8 pixels per loop
+      "uxtl       v1.8h, v1.8b                  \n"  // 8 shorts
+      "uxtl       v2.4s, v1.4h                  \n"  // 8 ints
+      "uxtl2      v3.4s, v1.8h                  \n"
+      "scvtf      v2.4s, v2.4s                  \n"  // 8 floats
+      "scvtf      v3.4s, v3.4s                  \n"
+      "fmul       v2.4s, v2.4s, %3.s[0]         \n"  // scale
+      "fmul       v3.4s, v3.4s, %3.s[0]         \n"
+      "st1        {v2.16b, v3.16b}, [%1], #32   \n"  // store 8 floats
+      "b.gt       1b                            \n"
+      : "+r"(src),    // %0
+        "+r"(dst),    // %1
+        "+r"(width)   // %2
+      : "w"(scale)    // %3
+      : "cc", "memory", "v1", "v2", "v3");
+}
+
 float ScaleMaxSamples_NEON(const float* src,
                            float* dst,
                            float scale,
diff --git a/unit_test/planar_test.cc b/unit_test/planar_test.cc
index 9f95941ce..756089558 100644
--- a/unit_test/planar_test.cc
+++ b/unit_test/planar_test.cc
@@ -2168,6 +2168,53 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
   EXPECT_LE(diff, 1);
 }
 
+// Returns the largest absolute difference between the C and optimized output.
+float TestByteToFloat(int benchmark_width,
+                      int benchmark_height,
+                      int benchmark_iterations,
+                      int disable_cpu_flags,
+                      int benchmark_cpu_info,
+                      float scale) {
+  int i, j;
+  const int y_plane_size = benchmark_width * benchmark_height;
+
+  align_buffer_page_end(orig_y, y_plane_size * (1 + 4 + 4));
+  float* dst_opt = reinterpret_cast<float*>(orig_y + y_plane_size);
+  float* dst_c = reinterpret_cast<float*>(orig_y + y_plane_size * 5);
+
+  MemRandomize(orig_y, y_plane_size);
+  memset(dst_c, 0, y_plane_size * 4);
+  memset(dst_opt, 1, y_plane_size * 4);
+
+  // Disable all optimizations.
+  MaskCpuFlags(disable_cpu_flags);
+  ByteToFloat(orig_y, dst_c, scale, y_plane_size);
+
+  // Enable optimizations.
+  MaskCpuFlags(benchmark_cpu_info);
+  for (j = 0; j < benchmark_iterations; j++) {
+    ByteToFloat(orig_y, dst_opt, scale, y_plane_size);
+  }
+
+  float max_diff = 0;
+  for (i = 0; i < y_plane_size; ++i) {
+    float abs_diff = fabs(dst_c[i] - dst_opt[i]);
+    if (abs_diff > max_diff) {
+      max_diff = abs_diff;
+    }
+  }
+
+  free_aligned_buffer_page_end(orig_y);
+  return max_diff;
+}
+
+TEST_F(LibYUVPlanarTest, TestByteToFloat) {
+  float diff = TestByteToFloat(benchmark_width_, benchmark_height_,
+                               benchmark_iterations_, disable_cpu_flags_,
+                               benchmark_cpu_info_, 1.0f);
+  EXPECT_EQ(0.f, diff);
+}
+
 TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
   SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
   SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);