mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-06 16:56:55 +08:00
ByteToFloatRow_NEON to convert and scale bytes to floats
Each byte is converted to float (0.0 to 255.0) and then multiplied by a scale parameter. Bug: None Test: arm 64 build passes. Change-Id: I04736798540b8d985f60abdf0388e24a209d075b Reviewed-on: https://chromium-review.googlesource.com/930226 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Ian Field <ianfield@google.com>
This commit is contained in:
parent
0ea50cbc74
commit
85722f5d93
@ -498,6 +498,10 @@ int HalfFloatPlane(const uint16_t* src_y,
|
||||
int width,
|
||||
int height);
|
||||
|
||||
// Convert a buffer of bytes to floats, scale the values and store as floats.
// Each of the width bytes in src_y is multiplied by scale and written to the
// matching element of dst_y.
// Returns 0 on success, or -1 if src_y or dst_y is NULL or width <= 0.
|
||||
LIBYUV_API
|
||||
int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width);
|
||||
|
||||
// Quantize a rectangle of ARGB. Alpha unaffected.
|
||||
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
|
||||
// interval_size should be a value between 1 and 255.
|
||||
|
||||
@ -308,6 +308,7 @@ extern "C" {
|
||||
#define HAS_ARGBTOYROW_NEON
|
||||
#define HAS_BGRATOUVROW_NEON
|
||||
#define HAS_BGRATOYROW_NEON
|
||||
#define HAS_BYTETOFLOATROW_NEON
|
||||
#define HAS_COPYROW_NEON
|
||||
#define HAS_HALFFLOATROW_NEON
|
||||
#define HAS_I400TOARGBROW_NEON
|
||||
@ -3352,6 +3353,15 @@ void HalfFloatRow_Any_MSA(const uint16_t* src_ptr,
|
||||
uint16_t* dst_ptr,
|
||||
float param,
|
||||
int width);
|
||||
// Row functions that convert width bytes from src to floats, multiply each by
// scale and store the results in dst.
// Portable C version; loops over every element, so any width is handled.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width);
|
||||
// NEON version; consumes 8 bytes per loop iteration, so width is expected to
// be a positive multiple of 8.
void ByteToFloatRow_NEON(const uint8_t* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width);
|
||||
// Wrapper around ByteToFloatRow_NEON for widths that are not a multiple of 8.
void ByteToFloatRow_Any_NEON(const uint8_t* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
|
||||
@ -3123,6 +3123,27 @@ int HalfFloatPlane(const uint16_t* src_y,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert a buffer of bytes to floats, scale the values and store as floats.
|
||||
LIBYUV_API
|
||||
int ByteToFloat(const uint8_t* src_y, float* dst_y, float scale, int width) {
|
||||
void (*ByteToFloatRow)(const uint8_t* src, float* dst, float scale,
|
||||
int width) = ByteToFloatRow_C;
|
||||
if (!src_y || !dst_y || width <= 0) {
|
||||
return -1;
|
||||
}
|
||||
#if defined(HAS_BYTETOFLOATROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
ByteToFloatRow = ByteToFloatRow_Any_NEON;
|
||||
if (IS_ALIGNED(width, 8)) {
|
||||
ByteToFloatRow = ByteToFloatRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
ByteToFloatRow(src_y, dst_y, scale, width);
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Apply a lumacolortable to each ARGB pixel.
|
||||
LIBYUV_API
|
||||
int ARGBLumaColorTable(const uint8_t* src_argb,
|
||||
|
||||
@ -807,37 +807,52 @@ ANY11C(Convert8To16Row_Any_AVX2,
|
||||
#undef ANY11C
|
||||
|
||||
// Any 1 to 1 with a float parameter. ST and T are the source and destination
// element types; SBPP and BPP are their element sizes in bytes.
|
||||
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint16_t* src_ptr, uint16_t* dst_ptr, T param, \
|
||||
int width) { \
|
||||
SIMD_ALIGNED(uint16_t temp[32 * 2]); \
|
||||
memset(temp, 0, 64); /* for msan */ \
|
||||
#define ANY11P16(NAMEANY, ANY_SIMD, ST, T, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const ST* src_ptr, T* dst_ptr, float param, int width) { \
|
||||
SIMD_ALIGNED(ST temp[32]); \
|
||||
SIMD_ALIGNED(T out[32]); \
|
||||
memset(temp, 0, SBPP * 32); /* for msan */ \
|
||||
int r = width & MASK; \
|
||||
int n = width & ~MASK; \
|
||||
if (n > 0) { \
|
||||
ANY_SIMD(src_ptr, dst_ptr, param, n); \
|
||||
} \
|
||||
memcpy(temp, src_ptr + n, r * SBPP); \
|
||||
ANY_SIMD(temp, temp + 16, param, MASK + 1); \
|
||||
memcpy(dst_ptr + n, temp + 16, r * BPP); \
|
||||
ANY_SIMD(temp, out, param, MASK + 1); \
|
||||
memcpy(dst_ptr + n, out, r * BPP); \
|
||||
}
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_SSE2
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, float, 2, 2, 7)
|
||||
ANY11P16(HalfFloatRow_Any_SSE2, HalfFloatRow_SSE2, uint16_t, uint16_t, 2, 2, 7)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 2, 2, 15)
|
||||
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, uint16_t, uint16_t, 2, 2, 15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_F16C
|
||||
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, float, 2, 2, 15)
|
||||
ANY11P16(HalfFloat1Row_Any_F16C, HalfFloat1Row_F16C, float, 2, 2, 15)
|
||||
ANY11P16(HalfFloatRow_Any_F16C, HalfFloatRow_F16C, uint16_t, uint16_t, 2, 2, 15)
|
||||
ANY11P16(HalfFloat1Row_Any_F16C,
|
||||
HalfFloat1Row_F16C,
|
||||
uint16_t,
|
||||
uint16_t,
|
||||
2,
|
||||
2,
|
||||
15)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_NEON
|
||||
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, float, 2, 2, 7)
|
||||
ANY11P16(HalfFloat1Row_Any_NEON, HalfFloat1Row_NEON, float, 2, 2, 7)
|
||||
ANY11P16(HalfFloatRow_Any_NEON, HalfFloatRow_NEON, uint16_t, uint16_t, 2, 2, 7)
|
||||
ANY11P16(HalfFloat1Row_Any_NEON,
|
||||
HalfFloat1Row_NEON,
|
||||
uint16_t,
|
||||
uint16_t,
|
||||
2,
|
||||
2,
|
||||
7)
|
||||
#endif
|
||||
#ifdef HAS_HALFFLOATROW_MSA
|
||||
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, float, 2, 2, 31)
|
||||
ANY11P16(HalfFloatRow_Any_MSA, HalfFloatRow_MSA, uint16_t, uint16_t, 2, 2, 31)
|
||||
#endif
|
||||
#ifdef HAS_BYTETOFLOATROW_NEON
|
||||
// Source elements are 1 byte (uint8_t); destination elements are 4 bytes
// (float). BPP must be 4 so the remainder memcpy copies whole floats.
ANY11P16(ByteToFloatRow_Any_NEON, ByteToFloatRow_NEON, uint8_t, float, 1, 4, 7)
|
||||
#endif
|
||||
#undef ANY11P16
|
||||
|
||||
|
||||
@ -2774,6 +2774,14 @@ void HalfFloatRow_C(const uint16_t* src,
|
||||
}
|
||||
}
|
||||
|
||||
// Portable C row function: for each of the width bytes in src, stores the
// byte value multiplied by scale into the matching float slot of dst.
// A width of zero or less leaves dst untouched.
void ByteToFloatRow_C(const uint8_t* src, float* dst, float scale, int width) {
  int remaining = width;
  while (remaining > 0) {
    const float scaled = src[0] * scale;
    dst[0] = scaled;
    ++src;
    ++dst;
    --remaining;
  }
}
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
|
||||
uint8_t* dst_argb,
|
||||
int width,
|
||||
|
||||
@ -2659,6 +2659,32 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
// 32-bit ARM NEON row function: converts bytes from src to floats, multiplies
// each by scale and stores them to dst, 8 elements per loop iteration.
// The loop body runs before the remaining count is tested, so width is
// expected to be a positive multiple of 8.
void ByteToFloatRow_NEON(const uint8_t* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"vdup.32 q0, %3 \n"  // broadcast scale to every lane of q0
|
||||
|
||||
"1: \n"
|
||||
"vld1.8 {d2}, [%0]! \n" // load 8 bytes
|
||||
"subs %2, %2, #8 \n" // 8 pixels per loop
|
||||
"vmovl.u8 q1, d2 \n" // 8 shorts
|
||||
"vmovl.u16 q2, d2 \n" // 8 ints (low 4 shorts of q1)
|
||||
"vmovl.u16 q3, d3 \n"  // high 4 shorts of q1
|
||||
"vcvt.f32.u32 q2, q2 \n" // 8 floats
|
||||
"vcvt.f32.u32 q3, q3 \n"
|
||||
"vmul.f32 q2, q2, d0[0] \n" // scale
|
||||
"vmul.f32 q3, q3, d0[0] \n"
|
||||
"vst1.8 {q2, q3}, [%1]! \n" // store 8 floats
|
||||
"bgt 1b \n"  // repeat while the count left by subs is > 0
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2", "q3");
|
||||
}
|
||||
|
||||
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)..
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
@ -2700,6 +2700,30 @@ void HalfFloatRow_NEON(const uint16_t* src,
|
||||
: "cc", "memory", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// AArch64 NEON row function: converts bytes from src to floats, multiplies
// each by scale and stores them to dst, 8 elements per loop iteration.
// The loop body runs before the remaining count is tested, so width is
// expected to be a positive multiple of 8.
void ByteToFloatRow_NEON(const uint8_t* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"ld1 {v1.8b}, [%0], #8 \n" // load 8 bytes
|
||||
"subs %w2, %w2, #8 \n" // 8 pixels per loop
|
||||
"uxtl v1.8h, v1.8b \n" // 8 shorts
|
||||
"uxtl v2.4s, v1.4h \n" // 8 ints (low 4 shorts of v1)
|
||||
"uxtl2 v3.4s, v1.8h \n"  // high 4 shorts of v1
|
||||
// scvtf is a signed convert; the inputs are zero-extended bytes, so they are
// never negative.
"scvtf v2.4s, v2.4s \n" // 8 floats
|
||||
"scvtf v3.4s, v3.4s \n"
|
||||
"fmul v2.4s, v2.4s, %3.s[0] \n" // scale
|
||||
"fmul v3.4s, v3.4s, %3.s[0] \n"
|
||||
"st1 {v2.16b, v3.16b}, [%1], #32 \n" // store 8 floats
|
||||
"b.gt 1b \n"  // repeat while the count left by subs is > 0
|
||||
: "+r"(src), // %0
|
||||
"+r"(dst), // %1
|
||||
"+r"(width) // %2
|
||||
: "w"(scale) // %3
|
||||
: "cc", "memory", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
float ScaleMaxSamples_NEON(const float* src,
|
||||
float* dst,
|
||||
float scale,
|
||||
|
||||
@ -2168,6 +2168,52 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_One) {
|
||||
EXPECT_LE(diff, 1);
|
||||
}
|
||||
|
||||
float TestByteToFloat(int benchmark_width,
|
||||
int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
int disable_cpu_flags,
|
||||
int benchmark_cpu_info,
|
||||
float scale) {
|
||||
int i, j;
|
||||
const int y_plane_size = benchmark_width * benchmark_height;
|
||||
|
||||
align_buffer_page_end(orig_y, y_plane_size * (1 + 4 + 4));
|
||||
float* dst_opt = reinterpret_cast<float*>(orig_y + y_plane_size);
|
||||
float* dst_c = reinterpret_cast<float*>(orig_y + y_plane_size * 5);
|
||||
|
||||
MemRandomize(orig_y, y_plane_size);
|
||||
memset(dst_c, 0, y_plane_size * 4);
|
||||
memset(dst_opt, 1, y_plane_size * 4);
|
||||
|
||||
// Disable all optimizations.
|
||||
MaskCpuFlags(disable_cpu_flags);
|
||||
ByteToFloat(orig_y, dst_c, scale, y_plane_size);
|
||||
|
||||
// Enable optimizations.
|
||||
MaskCpuFlags(benchmark_cpu_info);
|
||||
for (j = 0; j < benchmark_iterations; j++) {
|
||||
ByteToFloat(orig_y, dst_opt, scale, y_plane_size);
|
||||
}
|
||||
|
||||
float max_diff = 0;
|
||||
for (i = 0; i < y_plane_size; ++i) {
|
||||
float abs_diff = fabs(dst_c[i] - dst_opt[i]);
|
||||
if (abs_diff > max_diff) {
|
||||
max_diff = abs_diff;
|
||||
}
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(orig_y);
|
||||
return max_diff;
|
||||
}
|
||||
|
||||
// With a scale of 1.0 the two ByteToFloat outputs compared by the helper must
// match exactly.
TEST_F(LibYUVPlanarTest, TestByteToFloat) {
  const float kScale = 1.0f;
  const float max_diff =
      TestByteToFloat(benchmark_width_, benchmark_height_,
                      benchmark_iterations_, disable_cpu_flags_,
                      benchmark_cpu_info_, kScale);
  EXPECT_EQ(0.f, max_diff);
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
|
||||
SIMD_ALIGNED(uint8_t orig_pixels[1280][4]);
|
||||
SIMD_ALIGNED(uint8_t dst_pixels_opt[1280][4]);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user