mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2026-01-01 03:12:16 +08:00
Convert16To8Row_NEON use shift without rounding
Fixes chromium PaintCanvasVideoRendererTest.HighBitDepth.

sqdmulh was creating a 9-bit value with rounding, which was then shifted right by 1 with no rounding. The rounding had an off-by-1 impact in some tests.

Pixel 3
C I010ToI420_Opt (749 ms)
Was sqdmulh I010ToI420_Opt (370 ms)
Now ushl I010ToI420_Opt (324 ms)

Pixel 4
C I010ToI420_Opt (581 ms)
Was sqdmulh I010ToI420_Opt (240 ms)
Now ushl I010ToI420_Opt (231 ms)

Bug: b/216321733, b/233233302
Change-Id: I26f673bb411401d1e4a8126bf22d61c649223e9b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3694143
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
d011314f14
commit
baef414478
@ -10,8 +10,6 @@
|
||||
|
||||
#include "libyuv/row.h"
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace libyuv {
|
||||
extern "C" {
|
||||
@ -21,6 +19,8 @@ extern "C" {
|
||||
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
|
||||
!defined(__aarch64__)
|
||||
|
||||
// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
|
||||
|
||||
// q0: Y uint16x8_t
|
||||
// d2: U uint8x8_t
|
||||
// d3: V uint8x8_t
|
||||
@ -2715,6 +2715,66 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
||||
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
|
||||
}
|
||||
|
||||
// Bilinear filter 8x2 -> 8x1
|
||||
// Blends two rows of 16-bit samples into dst_ptr: the row at src_ptr and the
// row located src_stride elements after it (src_ptr + src_stride).
// source_y_fraction is the weight of the second row out of 256; the first row
// is weighted by 256 - source_y_fraction.
// Every loop below consumes 8 samples per iteration and tests the counter
// after the work is done (subs ... bgt), so at least 8 samples are processed
// and dst_width is effectively rounded up to a multiple of 8.
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int dst_width,
|
||||
int source_y_fraction) {
|
||||
int y1_fraction = source_y_fraction;
|
||||
int y0_fraction = 256 - y1_fraction;
|
||||
// Second source row; the offset is in uint16_t elements, not bytes.
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
||||
|
||||
asm volatile(
|
||||
// Dispatch on y1_fraction: 0 -> copy path (100), 128 -> average path (50).
"cmp %4, #0 \n"
|
||||
"beq 100f \n"
|
||||
"cmp %4, #128 \n"
|
||||
"beq 50f \n"
|
||||
|
||||
// Broadcast the weights: d17 = y1_fraction, d16 = y0_fraction.
"vdup.16 d17, %4 \n"
|
||||
"vdup.16 d16, %5 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%1]! \n"
|
||||
"vld1.16 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #8 \n"
|
||||
// q2/q3 = row0 * y0_fraction + row1 * y1_fraction, widened to 32 bits.
"vmull.u16 q2, d0, d16 \n"
|
||||
"vmull.u16 q3, d1, d16 \n"
|
||||
"vmlal.u16 q2, d2, d17 \n"
|
||||
"vmlal.u16 q3, d3, d17 \n"
|
||||
// Rounding shift right by 8 (divide by 256) and narrow back to 16 bits.
"vrshrn.u32 d0, q2, #8 \n"
|
||||
"vrshrn.u32 d1, q3, #8 \n"
|
||||
"vst1.16 {q0}, [%0]! \n"
|
||||
"bgt 1b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
"vld1.16 {q0}, [%1]! \n"
|
||||
"vld1.16 {q1}, [%2]! \n"
|
||||
"subs %3, %3, #8 \n"
|
||||
// Rounding halving add of the two rows.
"vrhadd.u16 q0, q1 \n"
|
||||
"vst1.16 {q0}, [%0]! \n"
|
||||
"bgt 50b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
"vld1.16 {q0}, [%1]! \n"
|
||||
"subs %3, %3, #8 \n"
|
||||
"vst1.16 {q0}, [%0]! \n"
|
||||
"bgt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(src_ptr1), // %2
|
||||
"+r"(dst_width) // %3
|
||||
: "r"(y1_fraction), // %4
|
||||
"r"(y0_fraction) // %5
|
||||
// q8 is listed because the weights live in d16/d17.
: "cc", "memory", "q0", "q1", "q2", "q3", "q8");
|
||||
}
|
||||
|
||||
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
|
||||
void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
@ -3649,31 +3709,31 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||
// 32768 = 9 bits
|
||||
// 16384 = 10 bits
|
||||
// 4096 = 12 bits
|
||||
// 256 = 16 bits
|
||||
// 32768 = 9 bits = shr 1
|
||||
// 16384 = 10 bits = shr 2
|
||||
// 4096 = 12 bits = shr 4
|
||||
// 256 = 16 bits = shr 8
|
||||
// Narrows 16-bit samples from src_y into 8-bit samples at dst_y, 16 samples
// per loop iteration. The counter is tested after the work (subs ... bgt), so
// at least 16 samples are processed.
//
// NOTE(review): this listing appears to contain both the pre-change lines
// (vqdmulh/vqshrn, vst1.16, "+r"(scale) operand) and the post-change lines
// (vshl/vqmovn, vst1.8, "r"(shift) operand) of the commit described at the
// top of this page, with no +/- markers to tell them apart. Confirm against
// the upstream file before relying on the exact instruction sequence.
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
// For a power-of-two scale this yields minus the number of bits to drop,
// e.g. 16384 -> -2 and 256 -> -8 (assuming a 32-bit int).
// __builtin_clz(0) is undefined, so scale must be non-zero.
int shift = 15 - __builtin_clz(scale); // Negative for shl will shift right
|
||||
asm volatile(
|
||||
"vdup.16 q2, %2 \n"
|
||||
"vdup.16 q2, %3 \n"
|
||||
"1: \n"
|
||||
"vld1.16 {q0}, [%0]! \n"
|
||||
"vld1.16 {q1}, [%0]! \n"
|
||||
"vqdmulh.s16 q0, q0, q2 \n"
|
||||
"vqdmulh.s16 q1, q1, q2 \n"
|
||||
"vqshrn.u16 d0, q0, #1 \n"
|
||||
"vqshrn.u16 d1, q1, #1 \n"
|
||||
"vst1.16 {q0}, [%1]! \n"
|
||||
"subs %3, %3, #16 \n" // 16 src pixels per loop
|
||||
// Per-lane shift by the signed count in q2, then saturating narrow to 8 bits.
"vshl.u16 q0, q0, q2 \n"
|
||||
"vshl.u16 q1, q1, q2 \n"
|
||||
"vqmovn.u16 d0, q0 \n"
|
||||
"vqmovn.u16 d1, q1 \n"
|
||||
"subs %2, %2, #16 \n" // 16 src pixels per loop
|
||||
"vst1.8 {q0}, [%1]! \n"
|
||||
"bgt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(scale), // %2
|
||||
"+r"(width) // %3
|
||||
:
|
||||
"+r"(width) // %2
|
||||
: "r"(shift) // %3
|
||||
: "cc", "memory", "q0", "q1", "q2");
|
||||
}
|
||||
|
||||
|
||||
@ -2966,6 +2966,71 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
|
||||
: "cc", "memory", "v0", "v1", "v3", "v4", "v5");
|
||||
}
|
||||
|
||||
// Bilinear filter 8x2 -> 8x1
|
||||
// Blends two rows of 16-bit samples into dst_ptr: the row at src_ptr and the
// row located src_stride elements after it (src_ptr + src_stride).
// source_y_fraction is the weight of the second row out of 256; the first row
// is weighted by 256 - source_y_fraction.
// Every loop below consumes 8 samples per iteration and tests the counter
// after the work is done (subs ... b.gt), so at least 8 samples are processed
// and dst_width is effectively rounded up to a multiple of 8.
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
|
||||
const uint16_t* src_ptr,
|
||||
ptrdiff_t src_stride,
|
||||
int dst_width,
|
||||
int source_y_fraction) {
|
||||
int y1_fraction = source_y_fraction;
|
||||
int y0_fraction = 256 - y1_fraction;
|
||||
// Second source row; the offset is in uint16_t elements, not bytes.
const uint16_t* src_ptr1 = src_ptr + src_stride;
|
||||
|
||||
asm volatile(
|
||||
// Dispatch on y1_fraction: 0 -> copy path (100), 128 -> average path (50).
"cmp %w4, #0 \n"
|
||||
"b.eq 100f \n"
|
||||
"cmp %w4, #128 \n"
|
||||
"b.eq 50f \n"
|
||||
|
||||
// Broadcast the weights: v5 = y1_fraction, v4 = y0_fraction.
"dup v5.8h, %w4 \n"
|
||||
"dup v4.8h, %w5 \n"
|
||||
// General purpose row blend.
|
||||
"1: \n"
|
||||
"ld1 {v0.8h}, [%1], #16 \n"
|
||||
"ld1 {v1.8h}, [%2], #16 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
// v2/v3 = row0 * y0_fraction + row1 * y1_fraction, widened to 32 bits
// (low four lanes in v2, high four lanes in v3). The prfm lines are
// prefetch hints 448 bytes past each source pointer.
"umull v2.4s, v0.4h, v4.4h \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"umull2 v3.4s, v0.8h, v4.8h \n"
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"umlal v2.4s, v1.4h, v5.4h \n"
|
||||
"umlal2 v3.4s, v1.8h, v5.8h \n"
|
||||
// Rounding shift right by 8 (divide by 256) and narrow back to 16 bits.
"rshrn v0.4h, v2.4s, #8 \n"
|
||||
"rshrn2 v0.8h, v3.4s, #8 \n"
|
||||
"st1 {v0.8h}, [%0], #16 \n"
|
||||
"b.gt 1b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 50 / 50.
|
||||
"50: \n"
|
||||
"ld1 {v0.8h}, [%1], #16 \n"
|
||||
"ld1 {v1.8h}, [%2], #16 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
// Rounding halving add of the two rows.
"urhadd v0.8h, v0.8h, v1.8h \n"
|
||||
"prfm pldl1keep, [%2, 448] \n"
|
||||
"st1 {v0.8h}, [%0], #16 \n"
|
||||
"b.gt 50b \n"
|
||||
"b 99f \n"
|
||||
|
||||
// Blend 100 / 0 - Copy row unchanged.
|
||||
"100: \n"
|
||||
"ld1 {v0.8h}, [%1], #16 \n"
|
||||
"subs %w3, %w3, #8 \n"
|
||||
"prfm pldl1keep, [%1, 448] \n"
|
||||
"st1 {v0.8h}, [%0], #16 \n"
|
||||
"b.gt 100b \n"
|
||||
|
||||
"99: \n"
|
||||
: "+r"(dst_ptr), // %0
|
||||
"+r"(src_ptr), // %1
|
||||
"+r"(src_ptr1), // %2
|
||||
"+r"(dst_width) // %3
|
||||
: "r"(y1_fraction), // %4
|
||||
"r"(y0_fraction) // %5
|
||||
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
|
||||
}
|
||||
|
||||
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
|
||||
void ARGBBlendRow_NEON(const uint8_t* src_argb,
|
||||
const uint8_t* src_argb1,
|
||||
@ -4118,30 +4183,31 @@ void DivideRow_16_NEON(const uint16_t* src_y,
|
||||
}
|
||||
|
||||
// Use scale to convert lsb formats to msb, depending how many bits there are:
|
||||
// 32768 = 9 bits
|
||||
// 16384 = 10 bits
|
||||
// 4096 = 12 bits
|
||||
// 256 = 16 bits
|
||||
// 32768 = 9 bits = shr 1
|
||||
// 16384 = 10 bits = shr 2
|
||||
// 4096 = 12 bits = shr 4
|
||||
// 256 = 16 bits = shr 8
|
||||
// Narrows 16-bit samples from src_y into 8-bit samples at dst_y, 16 samples
// per loop iteration. The counter is tested after the work (subs ... b.gt),
// so at least 16 samples are processed.
//
// NOTE(review): this listing appears to contain both the pre-change lines
// (sqdmulh/uqshrn, "r"(scale) operand) and the post-change lines
// (ushl/uqxtn, "r"(shift) operand) of the commit described at the top of this
// page, with no +/- markers to tell them apart. Confirm against the upstream
// file before relying on the exact instruction sequence.
void Convert16To8Row_NEON(const uint16_t* src_y,
|
||||
uint8_t* dst_y,
|
||||
int scale,
|
||||
int width) {
|
||||
// For a power-of-two scale this yields minus the number of bits to drop,
// e.g. 16384 -> -2 and 256 -> -8 (assuming a 32-bit int).
// __builtin_clz(0) is undefined, so scale must be non-zero.
int shift = 15 - __builtin_clz(scale); // Negative for shl will shift right
|
||||
asm volatile(
|
||||
"dup v2.8h, %w3 \n"
|
||||
"1: \n"
|
||||
"ldp q0, q1, [%0], #32 \n"
|
||||
"sqdmulh v0.8h, v0.8h, v2.8h \n"
|
||||
"sqdmulh v1.8h, v1.8h, v2.8h \n"
|
||||
// Per-lane shift by the signed count in v2; the uqxtn/uqxtn2 pair further
// down does the saturating narrow to 8 bits.
"ushl v0.8h, v0.8h, v2.8h \n"
|
||||
"ushl v1.8h, v1.8h, v2.8h \n"
|
||||
// Prefetch hint 448 bytes past the source pointer.
"prfm pldl1keep, [%0, 448] \n"
|
||||
"uqxtn v0.8b, v0.8h \n"
|
||||
"uqxtn2 v0.16b, v1.8h \n"
|
||||
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
|
||||
"uqshrn v0.8b, v0.8h, #1 \n"
|
||||
"uqshrn2 v0.16b, v1.8h, #1 \n"
|
||||
"str q0, [%1], #16 \n" // store 16 pixels
|
||||
"b.gt 1b \n"
|
||||
: "+r"(src_y), // %0
|
||||
"+r"(dst_y), // %1
|
||||
"+r"(width) // %2
|
||||
: "r"(scale) // %3
|
||||
: "r"(shift) // %3
|
||||
: "cc", "memory", "v0", "v1", "v2");
|
||||
}
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user