mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Fix arm unittest failure by removing unused FloatDivToByteRow.
Apply clang-format to fix jpeg if() for lint fix. Change comments about 4th pixel for open source compliance. Rename UVToVU to SwapUV for consistency with MergeUV. BUG=b/135532289, b/136515133 Change-Id: I9ce377c57b1d4d8f8b373c4cb44cd3f836300f79 Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/1685936 Reviewed-by: Chong Zhang <chz@google.com> Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
c6dcbdfaac
commit
f9aacffa02
@ -376,7 +376,7 @@ extern "C" {
|
||||
#define HAS_SETROW_NEON
|
||||
#define HAS_SPLITRGBROW_NEON
|
||||
#define HAS_SPLITUVROW_NEON
|
||||
#define HAS_UVToVUROW_NEON
|
||||
#define HAS_SWAPUVROW_NEON
|
||||
#define HAS_UYVYTOARGBROW_NEON
|
||||
#define HAS_UYVYTOUV422ROW_NEON
|
||||
#define HAS_UYVYTOUVROW_NEON
|
||||
@ -409,7 +409,6 @@ extern "C" {
|
||||
|
||||
// The following are available on AArch64 platforms:
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
|
||||
#define HAS_FLOATDIVTOBYTEROW_NEON
|
||||
#define HAS_SCALESUMSAMPLES_NEON
|
||||
#endif
|
||||
#if !defined(LIBYUV_DISABLE_MSA) && defined(__mips_msa)
|
||||
@ -3372,9 +3371,9 @@ void UYVYToUV422Row_Any_MMI(const uint8_t* src_ptr,
|
||||
uint8_t* dst_u,
|
||||
uint8_t* dst_v,
|
||||
int width);
|
||||
void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void UVToVURow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void SwapUVRow_Any_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width);
|
||||
void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width);
|
||||
void AYUVToUVRow_C(const uint8_t* src_ayuv,
|
||||
int stride_ayuv,
|
||||
@ -4018,17 +4017,6 @@ float ScaleSumSamples_NEON(const float* src,
|
||||
void ScaleSamples_C(const float* src, float* dst, float scale, int width);
|
||||
void ScaleSamples_NEON(const float* src, float* dst, float scale, int width);
|
||||
|
||||
void FloatDivToByteRow_C(const float* src_weights,
|
||||
const float* src_values,
|
||||
uint8_t* dst_out,
|
||||
uint8_t* dst_mask,
|
||||
int width);
|
||||
void FloatDivToByteRow_NEON(const float* src_weights,
|
||||
const float* src_values,
|
||||
uint8_t* dst_out,
|
||||
uint8_t* dst_mask,
|
||||
int width);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -69,13 +69,13 @@ static uint32_t ARGBDetectRow_C(const uint8_t* argb, int width) {
|
||||
if (argb[0] != 255) { // First byte is not Alpha of 255, so not ARGB.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[3] != 255) { // 4th byte is not Alpha of 255, so not BGRA.
|
||||
if (argb[3] != 255) { // Fourth byte is not Alpha of 255, so not BGRA.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
if (argb[4] != 255) { // Second pixel first byte is not Alpha of 255.
|
||||
return FOURCC_BGRA;
|
||||
}
|
||||
if (argb[7] != 255) { // Second pixel 4th byte is not Alpha of 255.
|
||||
if (argb[7] != 255) { // Second pixel fourth byte is not Alpha of 255.
|
||||
return FOURCC_ARGB;
|
||||
}
|
||||
argb += 8;
|
||||
|
||||
@ -430,7 +430,7 @@ boolean fill_input_buffer(j_decompress_ptr cinfo) {
|
||||
void skip_input_data(j_decompress_ptr cinfo, long num_bytes) { // NOLINT
|
||||
jpeg_source_mgr* src = cinfo->src;
|
||||
size_t bytes = static_cast<size_t>(num_bytes);
|
||||
if(bytes > src->bytes_in_buffer) {
|
||||
if (bytes > src->bytes_in_buffer) {
|
||||
src->next_input_byte = nullptr;
|
||||
src->bytes_in_buffer = 0;
|
||||
} else {
|
||||
|
||||
@ -516,8 +516,8 @@ int NV21ToNV12(const uint8_t* src_y,
|
||||
int width,
|
||||
int height) {
|
||||
int y;
|
||||
void (*UVToVURow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
|
||||
UVToVURow_C;
|
||||
void (*SwapUVRow)(const uint8_t* src_uv, uint8_t* dst_vu, int width) =
|
||||
SwapUVRow_C;
|
||||
|
||||
int halfwidth = (width + 1) >> 1;
|
||||
int halfheight = (height + 1) >> 1;
|
||||
@ -540,11 +540,11 @@ int NV21ToNV12(const uint8_t* src_y,
|
||||
src_stride_vu = dst_stride_uv = 0;
|
||||
}
|
||||
|
||||
#if defined(HAS_UVToVUROW_NEON)
|
||||
#if defined(HAS_SWAPUVROW_NEON)
|
||||
if (TestCpuFlag(kCpuHasNEON)) {
|
||||
UVToVURow = UVToVURow_Any_NEON;
|
||||
SwapUVRow = SwapUVRow_Any_NEON;
|
||||
if (IS_ALIGNED(halfwidth, 16)) {
|
||||
UVToVURow = UVToVURow_NEON;
|
||||
SwapUVRow = SwapUVRow_NEON;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
@ -553,7 +553,7 @@ int NV21ToNV12(const uint8_t* src_y,
|
||||
}
|
||||
|
||||
for (y = 0; y < halfheight; ++y) {
|
||||
UVToVURow(src_vu, dst_uv, halfwidth);
|
||||
SwapUVRow(src_vu, dst_uv, halfwidth);
|
||||
src_vu += src_stride_vu;
|
||||
dst_uv += dst_stride_uv;
|
||||
}
|
||||
|
||||
@ -710,8 +710,8 @@ ANY11(UYVYToYRow_Any_MMI, UYVYToYRow_MMI, 1, 4, 1, 15)
|
||||
#ifdef HAS_AYUVTOYROW_NEON
|
||||
ANY11(AYUVToYRow_Any_NEON, AYUVToYRow_NEON, 0, 4, 1, 15)
|
||||
#endif
|
||||
#ifdef HAS_AYUVTOYROW_NEON
|
||||
ANY11(UVToVURow_Any_NEON, UVToVURow_NEON, 0, 2, 2, 15)
|
||||
#ifdef HAS_SWAPUVROW_NEON
|
||||
ANY11(SwapUVRow_Any_NEON, SwapUVRow_NEON, 0, 2, 2, 15)
|
||||
#endif
|
||||
#ifdef HAS_RGB24TOARGBROW_NEON
|
||||
ANY11(RGB24ToARGBRow_Any_NEON, RGB24ToARGBRow_NEON, 0, 3, 4, 7)
|
||||
|
||||
@ -3319,7 +3319,7 @@ void AYUVToYRow_C(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
void SwapUVRow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
uint8_t u = src_uv[0];
|
||||
@ -3331,19 +3331,6 @@ void UVToVURow_C(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
}
|
||||
}
|
||||
|
||||
// Divides each value by its weight and stores the result as a byte, and
// writes a per-element mask byte that flags weights that are not positive.
//   src_weights: `width` float weights (the divisors).
//   src_values:  `width` float values (the dividends).
//   dst_out:     `width` output bytes, Clamp(value / weight).
//   dst_mask:    `width` mask bytes: 0 when weight > 0, otherwise 0xff.
//   width:       number of elements to process.
// Clamp is defined elsewhere; presumably it limits the result to 0..255 -
// confirm against its definition.
|
||||
void FloatDivToByteRow_C(const float* src_weights,
|
||||
const float* src_values,
|
||||
uint8_t* dst_out,
|
||||
uint8_t* dst_mask,
|
||||
int width) {
|
||||
int x;
|
||||
for (x = 0; x < width; ++x) {
|
||||
// NOTE(review): the division is performed even when the weight is 0, so the
// quotient handed to Clamp may be inf or NaN - confirm this is acceptable.
dst_out[x] = Clamp(src_values[x] / src_weights[x]);
|
||||
// Mask is 0 only for strictly positive weights; any other weight gives 0xff.
dst_mask[x] = src_weights[x] > 0 ? 0 : 0xff;
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef __cplusplus
|
||||
} // extern "C"
|
||||
} // namespace libyuv
|
||||
|
||||
@ -6120,24 +6120,24 @@ void I422ToYUY2Row_SSE2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
|
||||
"sub %1,%2 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%1),%%xmm2 \n"
|
||||
"movq 0x00(%1,%2,1),%%xmm1 \n"
|
||||
"add $0x8,%1 \n"
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm2,%%xmm0 \n"
|
||||
"punpckhbw %%xmm2,%%xmm1 \n"
|
||||
"movdqu %%xmm0,(%3) \n"
|
||||
"movdqu %%xmm1,0x10(%3) \n"
|
||||
"lea 0x20(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
"1: \n"
|
||||
"movq (%1),%%xmm2 \n"
|
||||
"movq 0x00(%1,%2,1),%%xmm1 \n"
|
||||
"add $0x8,%1 \n"
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"movdqa %%xmm0,%%xmm1 \n"
|
||||
"punpcklbw %%xmm2,%%xmm0 \n"
|
||||
"punpckhbw %%xmm2,%%xmm1 \n"
|
||||
"movdqu %%xmm0,(%3) \n"
|
||||
"movdqu %%xmm1,0x10(%3) \n"
|
||||
"lea 0x20(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
@ -6156,24 +6156,24 @@ void I422ToUYVYRow_SSE2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
|
||||
"sub %1,%2 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"movq (%1),%%xmm2 \n"
|
||||
"movq 0x00(%1,%2,1),%%xmm1 \n"
|
||||
"add $0x8,%1 \n"
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm1 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||
"punpckhbw %%xmm0,%%xmm2 \n"
|
||||
"movdqu %%xmm1,(%3) \n"
|
||||
"movdqu %%xmm2,0x10(%3) \n"
|
||||
"lea 0x20(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
"1: \n"
|
||||
"movq (%1),%%xmm2 \n"
|
||||
"movq 0x00(%1,%2,1),%%xmm1 \n"
|
||||
"add $0x8,%1 \n"
|
||||
"punpcklbw %%xmm1,%%xmm2 \n"
|
||||
"movdqu (%0),%%xmm0 \n"
|
||||
"movdqa %%xmm2,%%xmm1 \n"
|
||||
"add $0x10,%0 \n"
|
||||
"punpcklbw %%xmm0,%%xmm1 \n"
|
||||
"punpckhbw %%xmm0,%%xmm2 \n"
|
||||
"movdqu %%xmm1,(%3) \n"
|
||||
"movdqu %%xmm2,0x10(%3) \n"
|
||||
"lea 0x20(%3),%3 \n"
|
||||
"sub $0x10,%4 \n"
|
||||
"jg 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
@ -6192,27 +6192,27 @@ void I422ToYUY2Row_AVX2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
|
||||
"sub %1,%2 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vpmovzxbw (%1),%%ymm1 \n"
|
||||
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
|
||||
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
|
||||
"vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
|
||||
"vextractf128 $0x0,%%ymm1,(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
|
||||
"lea 0x40(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
"1: \n"
|
||||
"vpmovzxbw (%1),%%ymm1 \n"
|
||||
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
|
||||
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpunpcklbw %%ymm2,%%ymm0,%%ymm1 \n"
|
||||
"vpunpckhbw %%ymm2,%%ymm0,%%ymm2 \n"
|
||||
"vextractf128 $0x0,%%ymm1,(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
|
||||
"lea 0x40(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
@ -6231,27 +6231,27 @@ void I422ToUYVYRow_AVX2(const uint8_t* src_y,
|
||||
int width) {
|
||||
asm volatile(
|
||||
|
||||
"sub %1,%2 \n"
|
||||
"sub %1,%2 \n"
|
||||
|
||||
LABELALIGN
|
||||
"1: \n"
|
||||
"vpmovzxbw (%1),%%ymm1 \n"
|
||||
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
|
||||
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
|
||||
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
|
||||
"vextractf128 $0x0,%%ymm1,(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
|
||||
"lea 0x40(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
"1: \n"
|
||||
"vpmovzxbw (%1),%%ymm1 \n"
|
||||
"vpmovzxbw 0x00(%1,%2,1),%%ymm2 \n"
|
||||
"add $0x10,%1 \n"
|
||||
"vpsllw $0x8,%%ymm2,%%ymm2 \n"
|
||||
"vpor %%ymm1,%%ymm2,%%ymm2 \n"
|
||||
"vmovdqu (%0),%%ymm0 \n"
|
||||
"add $0x20,%0 \n"
|
||||
"vpunpcklbw %%ymm0,%%ymm2,%%ymm1 \n"
|
||||
"vpunpckhbw %%ymm0,%%ymm2,%%ymm2 \n"
|
||||
"vextractf128 $0x0,%%ymm1,(%3) \n"
|
||||
"vextractf128 $0x0,%%ymm2,0x10(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm1,0x20(%3) \n"
|
||||
"vextractf128 $0x1,%%ymm2,0x30(%3) \n"
|
||||
"lea 0x40(%3),%3 \n"
|
||||
"sub $0x20,%4 \n"
|
||||
"jg 1b \n"
|
||||
"vzeroupper \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(src_u), // %1
|
||||
"+r"(src_v), // %2
|
||||
|
||||
@ -2769,7 +2769,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_yuv24,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
|
||||
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
|
||||
"vmov d1, d0 \n"
|
||||
@ -2854,7 +2854,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
// Similar to ARGBExtractAlphaRow_NEON
|
||||
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
|
||||
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
|
||||
"subs %2, %2, #16 \n" // 16 processed per loop
|
||||
@ -2868,9 +2868,9 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
}
|
||||
|
||||
// Convert biplanar UV channel of NV12 to NV21
|
||||
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
|
||||
"vld2.8 {d1, d3}, [%0]! \n"
|
||||
"vorr.u8 q2, q0, q0 \n" // move U after V
|
||||
|
||||
@ -2882,7 +2882,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
|
||||
uint8_t* dst_yuv24,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld1 {v2.16b}, [%0], #16 \n" // load 16 Y values
|
||||
"ld2 {v0.8b, v1.8b}, [%1], #16 \n" // load 8 VU values
|
||||
"zip1 v0.16b, v0.16b, v0.16b \n" // replicate V values
|
||||
@ -2905,9 +2905,8 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
|
||||
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
|
||||
asm volatile(
|
||||
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
|
||||
// pixels.
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 ayuv
|
||||
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
|
||||
"uaddlp v1.8h, v1.16b \n" // U 16 bytes -> 8 shorts.
|
||||
"ld4 {v4.16b,v5.16b,v6.16b,v7.16b}, [%1], #64 \n" // load next 16
|
||||
@ -2933,7 +2932,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
const uint8_t* src_ayuv_1 = src_ayuv + src_stride_ayuv;
|
||||
asm volatile(
|
||||
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
|
||||
// pixels.
|
||||
"uaddlp v0.8h, v0.16b \n" // V 16 bytes -> 8 shorts.
|
||||
@ -2957,7 +2956,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
|
||||
// Copy row of AYUV Y's into Y
|
||||
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16
|
||||
// pixels
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop
|
||||
@ -2970,52 +2969,10 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3");
|
||||
}
|
||||
|
||||
// AArch64 NEON version of FloatDivToByteRow. Each loop iteration loads 8 float
// weights and 8 float values, divides values by weights, converts the
// quotients to integers, narrows them to 8 bytes stored to dst_out, and then
// stores 8 mask bytes to dst_mask. width is decremented by 8 per iteration and
// the loop repeats while the remaining count is greater than zero.
// NOTE(review): width is presumably expected to be a multiple of 8 - confirm
// with callers.
void FloatDivToByteRow_NEON(const float* src_weights,
|
||||
const float* src_values,
|
||||
uint8_t* dst_out,
|
||||
uint8_t* dst_mask,
|
||||
int width) {
|
||||
asm volatile(
|
||||
"movi v0.4s, #0 \n" // v0 = 0, used by the compares below
|
||||
|
||||
"1: \n"
|
||||
"ld1 {v1.4s,v2.4s}, [%0], #32 \n" // load 8 float weights
|
||||
"ld1 {v3.4s,v4.4s}, [%1], #32 \n" // load 8 float values
|
||||
"subs %w4, %w4, #8 \n" // 8 pixels per loop
|
||||
|
||||
// The quotients overwrite the weights held in v1/v2.
"fdiv v1.4s, v3.4s, v1.4s \n" // values / weights
|
||||
"fdiv v2.4s, v4.4s, v2.4s \n"
|
||||
|
||||
// NOTE(review): fcvtas converts to signed integers while uqxtn narrows with
// unsigned saturation - check how negative quotients are meant to be handled.
"fcvtas v1.4s, v1.4s \n" // float to int
|
||||
"fcvtas v2.4s, v2.4s \n" // float to int
|
||||
"uqxtn v1.4h, v1.4s \n" // 8 shorts
|
||||
"uqxtn2 v1.8h, v2.4s \n"
|
||||
"uqxtn v1.8b, v1.8h \n" // 8 bytes
|
||||
|
||||
"st1 {v1.8b}, [%2], #8 \n" // store 8 byte out
|
||||
|
||||
// NOTE(review): v1/v2 no longer hold the weights loaded at the top of the
// loop (they were overwritten by the fdiv/fcvtas/uqxtn sequence above), so
// these compares do not test the original weights.
"fcmgt v5.4s, v1.4s, v0.4s \n" // compare v1 > 0 (intended: weight > 0)
|
||||
"fcmgt v6.4s, v2.4s, v0.4s \n" // compare v2 > 0
|
||||
"uqxtn v5.4h, v5.4s \n" // 8 shorts
|
||||
"uqxtn2 v5.8h, v6.4s \n"
|
||||
// NOTE(review): this narrow reads v1.8h rather than v5.8h, so the bytes
// stored as the mask come from v1, not from the compare results in v5.
"uqxtn v5.8b, v1.8h \n" // 8 bytes
|
||||
|
||||
"st1 {v5.8b}, [%3], #8 \n" // store 8 byte mask
|
||||
|
||||
"b.gt 1b \n" // loop while the count left by subs is > 0
|
||||
: "+r"(src_weights), // %0
|
||||
"+r"(src_values), // %1
|
||||
"+r"(dst_out), // %2
|
||||
"+r"(dst_mask), // %3
|
||||
"+r"(width) // %4
|
||||
:
|
||||
// NOTE(review): v0 is written by the movi above but is not listed here.
: "cc", "memory", "v1", "v2", "v3", "v4", "v5", "v6");
|
||||
}
|
||||
|
||||
// Convert biplanar UV channel of NV12 to NV21
|
||||
void UVToVURow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
|
||||
asm volatile(
|
||||
"1: \n"
|
||||
"1: \n"
|
||||
"ld2 {v0.16b, v1.16b}, [%0], #32 \n" // load 16 UV values
|
||||
"orr v2.16b, v0.16b, v0.16b \n" // move U after V
|
||||
"subs %w2, %w2, #16 \n" // 16 pixels per loop
|
||||
|
||||
@ -5450,7 +5450,7 @@ void ComputeCumulativeSumRow_SSE2(const uint8_t* row,
|
||||
|
||||
// 1 pixel loop
|
||||
l1:
|
||||
movd xmm2, dword ptr [eax] // 1 argb pixel, 4 bytes.
|
||||
movd xmm2, dword ptr [eax] // 1 argb pixel
|
||||
lea eax, [eax + 4]
|
||||
punpcklbw xmm2, xmm1
|
||||
punpcklwd xmm2, xmm1
|
||||
|
||||
@ -993,8 +993,7 @@ TESTATOPLANAR(I400, 1, 1, I420, 2, 2, 2)
|
||||
TESTATOPLANAR(J400, 1, 1, J420, 2, 2, 2)
|
||||
TESTATOPLANAR(RAW, 3, 1, I420, 2, 2, 4)
|
||||
TESTATOPLANAR(RGB24, 3, 1, I420, 2, 2, 4)
|
||||
// TODO(fbarchard): Investigate J420 error of 11 on Windows.
|
||||
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 11)
|
||||
TESTATOPLANAR(RGB24, 3, 1, J420, 2, 2, 4)
|
||||
TESTATOPLANAR(RGB565, 2, 1, I420, 2, 2, 5)
|
||||
TESTATOPLANAR(RGBA, 4, 1, I420, 2, 2, 4)
|
||||
TESTATOPLANAR(UYVY, 2, 1, I420, 2, 2, 2)
|
||||
|
||||
@ -3269,88 +3269,7 @@ TEST_F(LibYUVPlanarTest, TestGaussCol_Opt) {
|
||||
EXPECT_EQ(dst_pixels_c[639], static_cast<uint32_t>(30704));
|
||||
}
|
||||
|
||||
// Test helper for FloatDivToByteRow. Builds kPixels weights (all equal to
// `scale`) and kPixels sine-wave values, computes a reference result once with
// FloatDivToByteRow_C, then runs the row function benchmark_iterations times
// into separate "opt" buffers and returns the largest per-element difference
// (output difference plus mask difference) between the two results.
//   opt == true  : uses FloatDivToByteRow_NEON when
//                  HAS_FLOATDIVTOBYTEROW_NEON is defined, otherwise the C
//                  version.
//   opt == false : always uses the C version.
float TestFloatDivToByte(int benchmark_width,
|
||||
int benchmark_height,
|
||||
int benchmark_iterations,
|
||||
float scale,
|
||||
bool opt) {
|
||||
int i, j;
|
||||
// NEON does multiple of 8, so round count up
|
||||
const int kPixels = (benchmark_width * benchmark_height + 7) & ~7;
|
||||
// Float buffers are sized kPixels * 4 bytes; byte outputs are kPixels bytes.
align_buffer_page_end(src_weights, kPixels * 4);
|
||||
align_buffer_page_end(src_values, kPixels * 4);
|
||||
align_buffer_page_end(dst_out_c, kPixels);
|
||||
align_buffer_page_end(dst_out_opt, kPixels);
|
||||
align_buffer_page_end(dst_mask_c, kPixels);
|
||||
align_buffer_page_end(dst_mask_opt, kPixels);
|
||||
|
||||
// Randomize works but may contain some denormals affecting performance.
|
||||
// MemRandomize(orig_y, kPixels * 4);
|
||||
// large values are problematic. audio is really -1 to 1.
|
||||
// NOTE(review): the three comments above mention orig_y and audio, neither of
// which appears in this function - presumably carried over from another test.
for (i = 0; i < kPixels; ++i) {
|
||||
(reinterpret_cast<float*>(src_weights))[i] = scale;
|
||||
(reinterpret_cast<float*>(src_values))[i] =
|
||||
sinf(static_cast<float>(i) * 0.1f);
|
||||
}
|
||||
// Each output buffer gets a different fill value, so reference and candidate
// buffers differ before the row functions run.
memset(dst_out_c, 0, kPixels);
|
||||
memset(dst_out_opt, 1, kPixels);
|
||||
memset(dst_mask_c, 2, kPixels);
|
||||
memset(dst_mask_opt, 3, kPixels);
|
||||
|
||||
// Reference result, computed once with the C implementation.
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
|
||||
reinterpret_cast<float*>(src_values), dst_out_c,
|
||||
dst_mask_c, kPixels);
|
||||
|
||||
// Candidate result, recomputed on every iteration into the *_opt buffers.
for (j = 0; j < benchmark_iterations; j++) {
|
||||
if (opt) {
|
||||
#ifdef HAS_FLOATDIVTOBYTEROW_NEON
|
||||
FloatDivToByteRow_NEON(reinterpret_cast<float*>(src_weights),
|
||||
reinterpret_cast<float*>(src_values), dst_out_opt,
|
||||
dst_mask_opt, kPixels);
|
||||
#else
|
||||
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
|
||||
reinterpret_cast<float*>(src_values), dst_out_opt,
|
||||
dst_mask_opt, kPixels);
|
||||
#endif
|
||||
} else {
|
||||
FloatDivToByteRow_C(reinterpret_cast<float*>(src_weights),
|
||||
reinterpret_cast<float*>(src_values), dst_out_opt,
|
||||
dst_mask_opt, kPixels);
|
||||
}
|
||||
}
|
||||
|
||||
uint8_t max_diff = 0;
|
||||
for (i = 0; i < kPixels; ++i) {
|
||||
// NOTE(review): the sum of the two absolute differences is stored in a
// uint8_t, so a combined difference above 255 would not be kept exactly.
uint8_t abs_diff = abs(dst_out_c[i] - dst_out_opt[i]) +
|
||||
abs(dst_mask_c[i] - dst_mask_opt[i]);
|
||||
if (abs_diff > max_diff) {
|
||||
max_diff = abs_diff;
|
||||
}
|
||||
}
|
||||
|
||||
free_aligned_buffer_page_end(src_weights);
|
||||
free_aligned_buffer_page_end(src_values);
|
||||
free_aligned_buffer_page_end(dst_out_c);
|
||||
free_aligned_buffer_page_end(dst_out_opt);
|
||||
free_aligned_buffer_page_end(dst_mask_c);
|
||||
free_aligned_buffer_page_end(dst_mask_opt);
|
||||
|
||||
// The uint8_t maximum is returned through the float return type.
return max_diff;
|
||||
}
|
||||
|
||||
// Runs TestFloatDivToByte with a weight of 1.2f and opt == false and expects
// the returned maximum difference to be zero.
TEST_F(LibYUVPlanarTest, TestFloatDivToByte_C) {
|
||||
float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, 1.2f, false);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
// Runs TestFloatDivToByte with a weight of 1.2f and opt == true and expects
// the returned maximum difference to be zero.
TEST_F(LibYUVPlanarTest, TestFloatDivToByte_Opt) {
|
||||
float diff = TestFloatDivToByte(benchmark_width_, benchmark_height_,
|
||||
benchmark_iterations_, 1.2f, true);
|
||||
EXPECT_EQ(0, diff);
|
||||
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, UVToVURow) {
|
||||
TEST_F(LibYUVPlanarTest, SwapUVRow) {
|
||||
const int kPixels = benchmark_width_ * benchmark_height_;
|
||||
align_buffer_page_end(src_pixels_vu, kPixels * 2);
|
||||
align_buffer_page_end(dst_pixels_uv, kPixels * 2);
|
||||
@ -3358,7 +3277,7 @@ TEST_F(LibYUVPlanarTest, UVToVURow) {
|
||||
MemRandomize(src_pixels_vu, kPixels * 2);
|
||||
memset(dst_pixels_uv, 1, kPixels * 2);
|
||||
|
||||
UVToVURow_C(src_pixels_vu, dst_pixels_uv, kPixels);
|
||||
SwapUVRow_C(src_pixels_vu, dst_pixels_uv, kPixels);
|
||||
|
||||
for (int i = 0; i < kPixels; ++i) {
|
||||
EXPECT_EQ(dst_pixels_uv[i * 2 + 0], src_pixels_vu[i * 2 + 1]);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user