mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 01:06:46 +08:00
[AArch64] Simplify predicate width calculations
Several of the existing SVE kernels used calculations of the form:
remainder = (width & (vl - 1)) == 0 ? vl : width & (vl - 1);
This is because the initially contributed SVE code unconditionally used
the predicated tail for the final iteration, even if the width was a
perfect multiple of the vector length.
In the current code, when the width is a perfect multiple of the vector
length, the fully-predicated main body loop instead iterates through the
width completely and skips the tail entirely. Skipping the tail means
that the case handled by the ternary condition now never occurs, and the
remainder calculation can now simply be:
remainder = width & (vl - 1);
This avoids the need for a compare and conditional select in the
function prologue.
Change-Id: Ia73f5f8bc66fad6bea64439dc2beeaccb54622d2
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6067151
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
parent
2c32b689e4
commit
03a935493d
@ -1887,7 +1887,6 @@ void I210ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // A
|
||||
@ -1936,7 +1935,6 @@ void I210AlphaToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"subs %w[width], %w[width], %w[vl] \n"
|
||||
@ -1986,7 +1984,6 @@ void I210ToAR30Row_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
@ -2033,7 +2030,6 @@ void P210ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
int width_last_uv = width_last_y + (width_last_y & 1);
|
||||
uint32_t nv_uv_start = 0x03010301U;
|
||||
uint32_t nv_uv_step = 0x04040404U;
|
||||
@ -2088,7 +2084,6 @@ void P210ToAR30Row_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
int width_last_uv = width_last_y + (width_last_y & 1);
|
||||
uint32_t nv_uv_start = 0x03010301U;
|
||||
uint32_t nv_uv_step = 0x04040404U;
|
||||
@ -2144,7 +2139,6 @@ void I410ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // A
|
||||
@ -2193,7 +2187,6 @@ void I410AlphaToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"cmp %w[width], %w[vl] \n"
|
||||
@ -2244,7 +2237,6 @@ void I410ToAR30Row_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
@ -2290,7 +2282,6 @@ void P410ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // A
|
||||
@ -2341,7 +2332,6 @@ void P410ToAR30Row_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
@ -2393,7 +2383,6 @@ void I212ToAR30Row_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
uint16_t limit = 0x3ff0;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
@ -2440,7 +2429,6 @@ void I212ToARGBRow_SVE2(const uint16_t* src_y,
|
||||
uint64_t vl;
|
||||
asm("cnth %0" : "=r"(vl));
|
||||
int width_last_y = width & (vl - 1);
|
||||
width_last_y = width_last_y == 0 ? vl : width_last_y;
|
||||
asm volatile(
|
||||
"ptrue p0.b \n" YUVTORGB_SVE_SETUP
|
||||
"dup z19.b, #255 \n" // A
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user