Convert16To8Row_NEON use shift without rounding

Fixes chromium PaintCanvasVideoRendererTest.HighBitDepth

sqdmulh was creating a 9-bit value with rounding, which was then shifted right by 1 with no rounding. The rounding caused an off-by-one difference in some tests.

Pixel 3
C           I010ToI420_Opt (749 ms)
Was sqdmulh I010ToI420_Opt (370 ms)
Now ushl    I010ToI420_Opt (324 ms)

Pixel 4
C           I010ToI420_Opt (581 ms)
Was sqdmulh I010ToI420_Opt (240 ms)
Now ushl    I010ToI420_Opt (231 ms)

Bug:  b/216321733, b/233233302
Change-Id: I26f673bb411401d1e4a8126bf22d61c649223e9b
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3694143
Reviewed-by: Justin Green <greenjustin@google.com>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-06-08 11:26:19 -07:00 committed by libyuv LUCI CQ
parent d011314f14
commit baef414478
2 changed files with 151 additions and 25 deletions

View File

@ -10,8 +10,6 @@
#include "libyuv/row.h"
#include <stdio.h>
#ifdef __cplusplus
namespace libyuv {
extern "C" {
@ -21,6 +19,8 @@ extern "C" {
#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__) && \
!defined(__aarch64__)
// d8-d15, r4-r11,r14(lr) need to be preserved if used. r13(sp),r15(pc) are reserved.
// q0: Y uint16x8_t
// d2: U uint8x8_t
// d3: V uint8x8_t
@ -2715,6 +2715,66 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
: "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14");
}
// Bilinear filter 8x2 -> 8x1
// Vertically blends two rows of 16-bit samples into one output row (ARM32 NEON).
//   dst_ptr:           output row.
//   src_ptr:           first input row. The second row is read from
//                      src_ptr + src_stride, so src_stride counts uint16_t
//                      elements, not bytes.
//   dst_width:         sample count. It is decremented by 8 per loop pass and
//                      only tested after the pass, so every path processes at
//                      least 8 samples and always whole groups of 8.
//                      NOTE(review): callers presumably pass a positive
//                      multiple of 8 - confirm against the call sites.
//   source_y_fraction: weight of the second row out of 256
//                      (0 = copy first row, 128 = rounded average).
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
// y1 weights the second row, y0 = 256 - y1 weights the first row.
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
// Dispatch on the fraction: 0 -> plain copy, 128 -> halving add,
// anything else -> general weighted blend.
"cmp %4, #0 \n"
"beq 100f \n"
"cmp %4, #128 \n"
"beq 50f \n"
// Broadcast the weights: d17 = y1_fraction, d16 = y0_fraction.
"vdup.16 d17, %4 \n"
"vdup.16 d16, %5 \n"
// General purpose row blend.
// dst = (row0 * y0 + row1 * y1 + 128) >> 8, computed in 32-bit lanes.
"1: \n"
"vld1.16 {q0}, [%1]! \n"
"vld1.16 {q1}, [%2]! \n"
"subs %3, %3, #8 \n"
"vmull.u16 q2, d0, d16 \n"
"vmull.u16 q3, d1, d16 \n"
"vmlal.u16 q2, d2, d17 \n"
"vmlal.u16 q3, d3, d17 \n"
// Rounding shift right by 8 and narrow back to 16 bits.
"vrshrn.u32 d0, q2, #8 \n"
"vrshrn.u32 d1, q3, #8 \n"
"vst1.16 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 50 / 50.
// vrhadd is a rounding halving add: (row0 + row1 + 1) >> 1.
"50: \n"
"vld1.16 {q0}, [%1]! \n"
"vld1.16 {q1}, [%2]! \n"
"subs %3, %3, #8 \n"
"vrhadd.u16 q0, q1 \n"
"vst1.16 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"vld1.16 {q0}, [%1]! \n"
"subs %3, %3, #8 \n"
"vst1.16 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_ptr1), // %2
"+r"(dst_width) // %3
: "r"(y1_fraction), // %4
"r"(y0_fraction) // %5
// q8 covers d16/d17, which hold the two broadcast weights.
: "cc", "memory", "q0", "q1", "q2", "q3", "q8");
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
@ -3649,31 +3709,31 @@ void DivideRow_16_NEON(const uint16_t* src_y,
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// 32768 = 9 bits = shr 1
// 16384 = 10 bits = shr 2
// 4096 = 12 bits = shr 4
// 256 = 16 bits = shr 8
//
// NOTE(review): this listing is a diff rendered without +/- markers, so the
// replaced and the replacement lines appear together (including the two scale
// tables above). The "replaced" / "replacement" markers below are inferred
// from the operand numbering and from the commit description ("use shift
// without rounding") - confirm against the real diff.
//
// Narrows 16-bit samples in src_y to 8-bit samples in dst_y (ARM32 NEON),
// 16 samples per loop pass. width is tested only after each pass, so at least
// 16 samples are processed.
// Replacement version: only the position of the highest set bit of scale is
// used. shift = 15 - clz(scale) gives -1, -2, -4, -8 for the scales
// 32768, 16384, 4096, 256 (assuming a 32-bit int), i.e. the negated "shr"
// amounts from the table. scale must be non-zero; __builtin_clz(0) is
// undefined per the GCC documentation.
void Convert16To8Row_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
int shift = 15 - __builtin_clz(scale); // Negative for shl will shift right
asm volatile(
// Replaced line (presumably): broadcast scale, which was operand %2.
"vdup.16 q2, %2 \n"
// Replacement line (presumably): broadcast shift, which is operand %3.
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
"vld1.16 {q1}, [%0]! \n"
// --- Replaced lines (presumably): saturating doubling multiply-high by
// scale, then saturating narrow with a further shift right by 1. ---
"vqdmulh.s16 q0, q0, q2 \n"
"vqdmulh.s16 q1, q1, q2 \n"
"vqshrn.u16 d0, q0, #1 \n"
"vqshrn.u16 d1, q1, #1 \n"
"vst1.16 {q0}, [%1]! \n"
"subs %3, %3, #16 \n" // 16 src pixels per loop
// --- Replacement lines (presumably): per-lane shift by the negative count
// in q2 (a right shift without rounding), then saturating narrow to 8
// bits and a 16-byte store. ---
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d1, q1 \n"
"subs %2, %2, #16 \n" // 16 src pixels per loop
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
// Replaced operand list (presumably): scale and width as in/out, no inputs.
"+r"(scale), // %2
"+r"(width) // %3
:
// Replacement operand list (presumably): width as in/out, shift as input.
"+r"(width) // %2
: "r"(shift) // %3
: "cc", "memory", "q0", "q1", "q2");
}

View File

@ -2966,6 +2966,71 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
: "cc", "memory", "v0", "v1", "v3", "v4", "v5");
}
// Bilinear filter 8x2 -> 8x1
// Vertically blends two rows of 16-bit samples into one output row (AArch64 NEON).
//   dst_ptr:           output row.
//   src_ptr:           first input row. The second row is read from
//                      src_ptr + src_stride, so src_stride counts uint16_t
//                      elements, not bytes.
//   dst_width:         sample count. It is decremented by 8 per loop pass and
//                      only tested after the pass, so every path processes at
//                      least 8 samples and always whole groups of 8.
//                      NOTE(review): callers presumably pass a positive
//                      multiple of 8 - confirm against the call sites.
//   source_y_fraction: weight of the second row out of 256
//                      (0 = copy first row, 128 = rounded average).
void InterpolateRow_16_NEON(uint16_t* dst_ptr,
const uint16_t* src_ptr,
ptrdiff_t src_stride,
int dst_width,
int source_y_fraction) {
// y1 weights the second row, y0 = 256 - y1 weights the first row.
int y1_fraction = source_y_fraction;
int y0_fraction = 256 - y1_fraction;
const uint16_t* src_ptr1 = src_ptr + src_stride;
asm volatile(
// Dispatch on the fraction: 0 -> plain copy, 128 -> halving add,
// anything else -> general weighted blend.
"cmp %w4, #0 \n"
"b.eq 100f \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
// Broadcast the weights: v5 = y1_fraction, v4 = y0_fraction.
"dup v5.8h, %w4 \n"
"dup v4.8h, %w5 \n"
// General purpose row blend.
// dst = (row0 * y0 + row1 * y1 + 128) >> 8, computed in 32-bit lanes.
// The prfm lines are prefetch hints 448 bytes past each source pointer.
"1: \n"
"ld1 {v0.8h}, [%1], #16 \n"
"ld1 {v1.8h}, [%2], #16 \n"
"subs %w3, %w3, #8 \n"
"umull v2.4s, v0.4h, v4.4h \n"
"prfm pldl1keep, [%1, 448] \n"
"umull2 v3.4s, v0.8h, v4.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"umlal v2.4s, v1.4h, v5.4h \n"
"umlal2 v3.4s, v1.8h, v5.8h \n"
// Rounding shift right by 8 and narrow back to 16 bits.
"rshrn v0.4h, v2.4s, #8 \n"
"rshrn2 v0.8h, v3.4s, #8 \n"
"st1 {v0.8h}, [%0], #16 \n"
"b.gt 1b \n"
"b 99f \n"
// Blend 50 / 50.
// urhadd is a rounding halving add: (row0 + row1 + 1) >> 1.
"50: \n"
"ld1 {v0.8h}, [%1], #16 \n"
"ld1 {v1.8h}, [%2], #16 \n"
"subs %w3, %w3, #8 \n"
"prfm pldl1keep, [%1, 448] \n"
"urhadd v0.8h, v0.8h, v1.8h \n"
"prfm pldl1keep, [%2, 448] \n"
"st1 {v0.8h}, [%0], #16 \n"
"b.gt 50b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
"ld1 {v0.8h}, [%1], #16 \n"
"subs %w3, %w3, #8 \n"
"prfm pldl1keep, [%1, 448] \n"
"st1 {v0.8h}, [%0], #16 \n"
"b.gt 100b \n"
"99: \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
"+r"(src_ptr1), // %2
"+r"(dst_width) // %3
: "r"(y1_fraction), // %4
"r"(y0_fraction) // %5
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5");
}
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
void ARGBBlendRow_NEON(const uint8_t* src_argb,
const uint8_t* src_argb1,
@ -4118,30 +4183,31 @@ void DivideRow_16_NEON(const uint16_t* src_y,
}
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 32768 = 9 bits
// 16384 = 10 bits
// 4096 = 12 bits
// 256 = 16 bits
// 32768 = 9 bits = shr 1
// 16384 = 10 bits = shr 2
// 4096 = 12 bits = shr 4
// 256 = 16 bits = shr 8
//
// NOTE(review): this listing is a diff rendered without +/- markers, so the
// replaced and the replacement lines appear together (including the two scale
// tables above). The "replaced" / "replacement" markers below are inferred
// from the commit description ("Was sqdmulh", "Now ushl") - confirm against
// the real diff.
//
// Narrows 16-bit samples in src_y to 8-bit samples in dst_y (AArch64 NEON),
// 16 samples per loop pass. width is tested only after each pass, so at least
// 16 samples are processed.
// Replacement version: only the position of the highest set bit of scale is
// used. shift = 15 - clz(scale) gives -1, -2, -4, -8 for the scales
// 32768, 16384, 4096, 256 (assuming a 32-bit int), i.e. the negated "shr"
// amounts from the table. scale must be non-zero; __builtin_clz(0) is
// undefined per the GCC documentation.
void Convert16To8Row_NEON(const uint16_t* src_y,
uint8_t* dst_y,
int scale,
int width) {
int shift = 15 - __builtin_clz(scale); // Negative for shl will shift right
asm volatile(
// Broadcast operand %3 to all lanes: scale before the change, shift after.
"dup v2.8h, %w3 \n"
"1: \n"
"ldp q0, q1, [%0], #32 \n"
// Replaced lines (presumably): saturating doubling multiply-high by scale.
"sqdmulh v0.8h, v0.8h, v2.8h \n"
"sqdmulh v1.8h, v1.8h, v2.8h \n"
// Replacement lines (presumably): per-lane shift by the negative count in
// v2, i.e. a right shift without rounding.
"ushl v0.8h, v0.8h, v2.8h \n"
"ushl v1.8h, v1.8h, v2.8h \n"
"prfm pldl1keep, [%0, 448] \n"
// Replacement lines (presumably): saturating narrow to 8 bits.
"uqxtn v0.8b, v0.8h \n"
"uqxtn2 v0.16b, v1.8h \n"
"subs %w2, %w2, #16 \n" // 16 src pixels per loop
// Replaced lines (presumably): saturating narrow with a shift right by 1.
"uqshrn v0.8b, v0.8h, #1 \n"
"uqshrn2 v0.16b, v1.8h, #1 \n"
"str q0, [%1], #16 \n" // store 16 pixels
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_y), // %1
"+r"(width) // %2
// Replaced input operand (presumably), followed by its replacement.
: "r"(scale) // %3
: "r"(shift) // %3
: "cc", "memory", "v0", "v1", "v2");
}