mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
FP16 to FP32 float conversion row function
Bug: None Change-Id: I97aab6aafd41c3bf36bfbf33fdcc424e5b3fd6e3 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/4590225 Commit-Queue: Frank Barchard <fbarchard@chromium.org> Reviewed-by: Wan-Teh Chang <wtc@google.com>
This commit is contained in:
parent
1602e4c607
commit
b08ccb6a83
@ -6180,7 +6180,14 @@ void ByteToFloatRow_Any_NEON(const uint8_t* src_ptr,
|
|||||||
float* dst_ptr,
|
float* dst_ptr,
|
||||||
float param,
|
float param,
|
||||||
int width);
|
int width);
|
||||||
|
// Convert FP16 Half Floats to FP32 Floats
//   src:   half-precision values passed as raw 16-bit words.
//   dst:   receives one 32-bit float per input value.
//   width: number of values to convert.
// NOTE(review): the NEON implementation converts 8 values per loop pass and
// checks the count after the pass - presumably width must be a positive
// multiple of 8; confirm against callers.
|
||||||
|
void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
|
||||||
|
float* dst,
|
||||||
|
int width);
|
||||||
|
// Convert FP32 Floats to FP16 Half Floats
//   src:   32-bit float input values.
//   dst:   receives one half-precision value per input, stored as a raw
//          16-bit word.
//   width: number of values to convert.
// NOTE(review): the NEON implementation converts 8 values per loop pass and
// checks the count after the pass - presumably width must be a positive
// multiple of 8; confirm against callers.
|
||||||
|
void ConvertFP32ToFP16Row_NEON(const float* src,
|
||||||
|
uint16_t* dst, // fp16
|
||||||
|
int width);
|
||||||
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
|
void ARGBLumaColorTableRow_C(const uint8_t* src_argb,
|
||||||
uint8_t* dst_argb,
|
uint8_t* dst_argb,
|
||||||
int width,
|
int width,
|
||||||
|
|||||||
@ -3960,6 +3960,46 @@ void ByteToFloatRow_NEON(const uint8_t* src,
|
|||||||
: "cc", "memory", "v1", "v2", "v3");
|
: "cc", "memory", "v1", "v2", "v3");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Convert FP16 Half Floats to FP32 Floats
// Widens a row of half-precision values (held as raw uint16_t) to 32-bit
// floats using the AArch64 fcvtl/fcvtl2 instructions.
//   src:   input row of fp16 values; advanced 16 bytes per loop pass.
//   dst:   output row of floats; advanced 32 bytes per loop pass.
//   width: number of values to convert.
// The loop converts 8 values per pass and tests the remaining count only
// after the pass, so at least 8 values are always processed and a width that
// is not a multiple of 8 is effectively rounded up to the next multiple.
|
||||||
|
void ConvertFP16ToFP32Row_NEON(const uint16_t* src, // fp16
|
||||||
|
float* dst,
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ld1 {v1.8h}, [%0], #16 \n" // load 8 halffloats
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 floats per loop; sets flags for b.gt
|
||||||
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead of src
|
||||||
|
"fcvtl v2.4s, v1.4h \n" // lower 4 halffloats -> 4 floats
|
||||||
|
"fcvtl2 v3.4s, v1.8h \n" // upper 4 halffloats -> 4 floats
|
||||||
|
"stp q2, q3, [%1], #32 \n" // store 8 floats
|
||||||
|
"b.gt 1b \n" // repeat while remaining width > 0
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
:
|
||||||
|
: "cc", "memory", "v1", "v2", "v3");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Convert FP32 Floats to FP16 Half Floats
// Narrows a row of 32-bit floats to half-precision values (stored as raw
// uint16_t) using the AArch64 fcvtn/fcvtn2 instructions.
//   src:   input row of floats; advanced 32 bytes per loop pass.
//   dst:   output row of fp16 values; advanced 16 bytes per loop pass.
//   width: number of values to convert.
// The loop converts 8 values per pass and tests the remaining count only
// after the pass, so at least 8 values are always processed and a width that
// is not a multiple of 8 is effectively rounded up to the next multiple.
|
||||||
|
void ConvertFP32ToFP16Row_NEON(const float* src,
|
||||||
|
uint16_t* dst, // fp16
|
||||||
|
int width) {
|
||||||
|
asm volatile(
|
||||||
|
"1: \n"
|
||||||
|
"ldp q2, q3, [%0], #32 \n" // load 8 floats
|
||||||
|
"subs %w2, %w2, #8 \n" // 8 floats per loop; sets flags for b.gt
|
||||||
|
"prfm pldl1keep, [%0, 448] \n" // prefetch 448 bytes ahead of src
|
||||||
|
"fcvtn v1.4h, v2.4s \n" // first 4 floats -> lower 4 halffloats
|
||||||
|
"fcvtn2 v1.8h, v3.4s \n" // next 4 floats -> upper 4 halffloats
|
||||||
|
"str q1, [%1], #16 \n" // store 8 fp16 halffloats
|
||||||
|
"b.gt 1b \n" // repeat while remaining width > 0
|
||||||
|
: "+r"(src), // %0
|
||||||
|
"+r"(dst), // %1
|
||||||
|
"+r"(width) // %2
|
||||||
|
:
|
||||||
|
: "cc", "memory", "v1", "v2", "v3");
|
||||||
|
}
|
||||||
|
|
||||||
float ScaleMaxSamples_NEON(const float* src,
|
float ScaleMaxSamples_NEON(const float* src,
|
||||||
float* dst,
|
float* dst,
|
||||||
float scale,
|
float scale,
|
||||||
|
|||||||
@ -4468,4 +4468,46 @@ TEST_F(LibYUVPlanarTest, NV21Copy) {
|
|||||||
free_aligned_buffer_page_end(dst_vu);
|
free_aligned_buffer_page_end(dst_vu);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) && \
    defined(__aarch64__)

// Round-trip test for the NEON FP16 <-> FP32 row converters.
// Float data is narrowed to FP16, widened back to FP32 (the benchmarked
// step), then narrowed again; the two FP16 results must match exactly.
TEST_F(LibYUVPlanarTest, TestConvertFP16ToFP32) {
  int i, j;
  // The NEON row functions convert 8 values per loop pass, so round the
  // element count up to a multiple of 8. Otherwise a benchmark size that is
  // not a multiple of 8 would make them read and write past the end of the
  // buffers below.
  const int y_plane_size = (benchmark_width_ * benchmark_height_ + 7) & ~7;

  align_buffer_page_end(orig_f, y_plane_size * 4);   // FP32 source values
  align_buffer_page_end(orig_y, y_plane_size * 2);   // FP16 reference
  align_buffer_page_end(dst_opt, y_plane_size * 4);  // FP32 from FP16
  align_buffer_page_end(rec_opt, y_plane_size * 2);  // FP16 reconverted

  // Largest input is 9999 * 3.14, which stays below the FP16 maximum finite
  // value (65504), so no value overflows to infinity when narrowed.
  for (i = 0; i < y_plane_size; ++i) {
    ((float*)orig_f)[i] = (float)(i % 10000) * 3.14f;
  }
  // Fill outputs with distinct byte patterns so stale data is detectable.
  memset(orig_y, 1, y_plane_size * 2);
  memset(dst_opt, 2, y_plane_size * 4);
  memset(rec_opt, 3, y_plane_size * 2);

  ConvertFP32ToFP16Row_NEON((const float*)orig_f, (uint16_t*)orig_y,
                            y_plane_size);

  for (j = 0; j < benchmark_iterations_; j++) {
    ConvertFP16ToFP32Row_NEON((const uint16_t*)orig_y, (float*)dst_opt,
                              y_plane_size);
  }

  ConvertFP32ToFP16Row_NEON((const float*)dst_opt, (uint16_t*)rec_opt,
                            y_plane_size);

  for (i = 0; i < y_plane_size; ++i) {
    EXPECT_EQ(((const uint16_t*)orig_y)[i], ((const uint16_t*)rec_opt)[i]);
  }

  free_aligned_buffer_page_end(orig_f);
  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_opt);
  free_aligned_buffer_page_end(rec_opt);
}

#endif  // defined(ENABLE_ROW_TESTS) && !defined(LIBYUV_DISABLE_NEON) &&
        // defined(__aarch64__)
|
||||||
|
|
||||||
} // namespace libyuv
|
} // namespace libyuv
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user