Apply clang format

Bug: None
Change-Id: I0d9db4b384144523e61ae32b6ab3f72e93a0c265
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/6138934
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: Wan-Teh Chang <wtc@google.com>
Frank Barchard 2025-01-02 13:20:17 -08:00
parent b5a18f9d93
commit e0040eb318
33 changed files with 1808 additions and 1804 deletions
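Most of the 33 files change in the same mechanical way: clang-format removes the space between asm volatile and the opening parenthesis, and re-wraps statements that run past the column limit at the assignment rather than mid-expression. A minimal sketch of the asm pattern (a hypothetical helper, not code from this commit):

#include <stdint.h>

// After clang-format there is no space between "volatile" and "(".
static inline uint32_t AddOne(uint32_t x) {
#if defined(__aarch64__)
  asm volatile(
      "add %w0, %w0, #1 \n"
      : "+r"(x));
#else
  ++x;  // Portable fallback so the sketch compiles everywhere.
#endif
  return x;
}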

View File

@@ -67,7 +67,6 @@ static const int kCpuHasLOONGARCH = 0x20;
 static const int kCpuHasLSX = 0x100;
 static const int kCpuHasLASX = 0x200;
 // Optional init function. TestCpuFlag does an auto-init.
 // Returns cpu_info flags.
 LIBYUV_API
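The comment above is the whole contract: TestCpuFlag() initializes cpu_info on first use, so callers query flags directly. A usage sketch, assuming the libyuv/cpu_id.h header that declares these constants:

#include "libyuv/cpu_id.h"

// Dispatch sketch: no explicit init call is needed before the first query.
void PickLoongArchPath() {
  if (libyuv::TestCpuFlag(libyuv::kCpuHasLASX)) {
    // Use the 256-bit LASX row functions.
  } else if (libyuv::TestCpuFlag(libyuv::kCpuHasLSX)) {
    // Fall back to the 128-bit LSX row functions.
  }
}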

View File

@@ -499,8 +499,8 @@ static inline void I422ToRGB565Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TORGB565_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TORGB565_SVE_FROM_TOP_2X
 "st2h {z18.h, z19.h}, p1, [%[dst]] \n"
 "99: \n"
@@ -558,8 +558,8 @@ static inline void I422ToARGB1555Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TOARGB1555_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TOARGB1555_SVE_FROM_TOP_2X
 "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
 "99: \n"
@@ -617,8 +617,8 @@ static inline void I422ToARGB4444Row_SVE_SC(
 // Calculate a predicate for the final iteration to deal with the tail.
 "cnth %[vl] \n"
 "whilelt p1.b, wzr, %w[width] \n" //
-READYUV422_SVE_2X I422TORGB_SVE_2X
-RGBTOARGB8_SVE_TOP_2X RGB8TOARGB4444_SVE_FROM_TOP_2X
+READYUV422_SVE_2X I422TORGB_SVE_2X RGBTOARGB8_SVE_TOP_2X
+RGB8TOARGB4444_SVE_FROM_TOP_2X
 "st2h {z0.h, z1.h}, p1, [%[dst]] \n"
 "99: \n"

View File

@@ -29,7 +29,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
 int count) {
 uint64_t diff;
-asm volatile (
+asm volatile(
 "xor %3,%3 \n"
 "xor %%r8,%%r8 \n"
 "xor %%r9,%%r9 \n"
@@ -77,7 +77,7 @@ uint32_t HammingDistance_SSE42(const uint8_t* src_a,
 int count) {
 uint32_t diff = 0u;
-asm volatile (
+asm volatile(
 // Process 16 bytes per loop.
 LABELALIGN
 "1: \n"
@@ -121,7 +121,7 @@ uint32_t HammingDistance_SSSE3(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movdqa %4,%%xmm2 \n"
 "movdqa %5,%%xmm3 \n"
 "pxor %%xmm0,%%xmm0 \n"
@@ -180,7 +180,7 @@ uint32_t HammingDistance_AVX2(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "vbroadcastf128 %4,%%ymm2 \n"
 "vbroadcastf128 %5,%%ymm3 \n"
 "vpxor %%ymm0,%%ymm0,%%ymm0 \n"
@@ -234,7 +234,7 @@ uint32_t SumSquareError_SSE2(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "pxor %%xmm0,%%xmm0 \n"
 "pxor %%xmm5,%%xmm5 \n"
@@ -300,7 +300,7 @@ static const uvec32 kHashMul3 = {
 uint32_t HashDjb2_SSE41(const uint8_t* src, int count, uint32_t seed) {
 uint32_t hash;
-asm volatile (
+asm volatile(
 "movd %2,%%xmm0 \n"
 "pxor %%xmm7,%%xmm7 \n"
 "movdqa %4,%%xmm6 \n"

View File

@@ -28,7 +28,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "vmov.u16 q4, #0 \n" // accumulator
 "1: \n"
@@ -58,7 +58,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "vmov.u8 q8, #0 \n"
 "vmov.u8 q10, #0 \n"
 "vmov.u8 q9, #0 \n"

View File

@@ -26,7 +26,7 @@ uint32_t HammingDistance_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movi v4.8h, #0 \n"
 "1: \n"
@@ -55,7 +55,7 @@ uint32_t SumSquareError_NEON(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t sse;
-asm volatile (
+asm volatile(
 "movi v16.16b, #0 \n"
 "movi v17.16b, #0 \n"
 "movi v18.16b, #0 \n"
@@ -157,7 +157,7 @@ uint32_t HammingDistance_NEON_DotProd(const uint8_t* src_a,
 const uint8_t* src_b,
 int count) {
 uint32_t diff;
-asm volatile (
+asm volatile(
 "movi v4.4s, #0 \n"
 "movi v5.4s, #0 \n"
 "movi v6.16b, #1 \n"
@@ -190,7 +190,7 @@ uint32_t SumSquareError_NEON_DotProd(const uint8_t* src_a,
 int count) {
 // count is guaranteed to be a multiple of 32.
 uint32_t sse;
-asm volatile (
+asm volatile(
 "movi v4.4s, #0 \n"
 "movi v5.4s, #0 \n"

View File

@@ -70,9 +70,8 @@ int ConvertToARGB(const uint8_t* sample,
 uint8_t* rotate_buffer = NULL;
 int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
-if (dst_argb == NULL || sample == NULL ||
-src_width <= 0 || src_width > INT_MAX / 4 ||
-crop_width <= 0 || crop_width > INT_MAX / 4 ||
+if (dst_argb == NULL || sample == NULL || src_width <= 0 ||
+src_width > INT_MAX / 4 || crop_width <= 0 || crop_width > INT_MAX / 4 ||
 src_height == 0 || crop_height == 0) {
 return -1;
 }
@@ -81,7 +80,8 @@ int ConvertToARGB(const uint8_t* sample,
 }
 if (need_buf) {
-const uint64_t rotate_buffer_size = (uint64_t)crop_width * 4 * abs_crop_height;
+const uint64_t rotate_buffer_size =
+(uint64_t)crop_width * 4 * abs_crop_height;
 if (rotate_buffer_size > SIZE_MAX) {
 return -1; // Invalid size.
 }
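The widening matters: with crop_width = 40000 and abs_crop_height = 40000 (both legal under the INT_MAX / 4 checks), the product is 6,400,000,000 bytes, which would wrap if computed in 32-bit arithmetic; computing in uint64_t and comparing against SIZE_MAX rejects it cleanly on 32-bit targets. A standalone sketch of the guard:

#include <stdint.h>

// Widen before multiplying so the product cannot wrap, then reject sizes
// that do not fit in size_t. The check is vacuous on 64-bit targets and
// meaningful on 32-bit ones.
bool ValidArgbBufferSize(int crop_width, int abs_crop_height) {
  const uint64_t size = (uint64_t)crop_width * 4 * abs_crop_height;
  return size <= SIZE_MAX;
}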

View File

@@ -65,8 +65,9 @@ int ConvertToI420(const uint8_t* sample,
 const int inv_crop_height =
 (src_height < 0) ? -abs_crop_height : abs_crop_height;
-if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 || src_width > INT_MAX / 4 ||
-crop_width <= 0 || src_height == 0 || crop_height == 0) {
+if (!dst_y || !dst_u || !dst_v || !sample || src_width <= 0 ||
+src_width > INT_MAX / 4 || crop_width <= 0 || src_height == 0 ||
+crop_height == 0) {
 return -1;
 }
@@ -78,7 +79,8 @@ int ConvertToI420(const uint8_t* sample,
 if (need_buf) {
 int y_size = crop_width * abs_crop_height;
 int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
-const uint64_t rotate_buffer_size = (uint64_t)y_size + (uint64_t)uv_size * 2;
+const uint64_t rotate_buffer_size =
+(uint64_t)y_size + (uint64_t)uv_size * 2;
 if (rotate_buffer_size > SIZE_MAX) {
 return -1; // Invalid size.
 }

View File

@@ -191,7 +191,8 @@ static int ARGBRotate180(const uint8_t* src_argb,
 #endif
 #if defined(HAS_COPYROW_AVX512BW)
 if (TestCpuFlag(kCpuHasAVX512BW)) {
-CopyRow = IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
+CopyRow =
+IS_ALIGNED(width * 4, 128) ? CopyRow_AVX512BW : CopyRow_Any_AVX512BW;
 }
 #endif
 #if defined(HAS_COPYROW_ERMS)

View File

@@ -26,7 +26,7 @@ void TransposeWx8_SSSE3(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -116,7 +116,7 @@ void TransposeWx8_Fast_SSSE3(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -261,7 +261,7 @@ void TransposeUVWx8_SSE2(const uint8_t* src,
 uint8_t* dst_b,
 int dst_stride_b,
 int width) {
-asm volatile (
+asm volatile(
 // Read in the data from the source pointer.
 // First round of bit swap.
 LABELALIGN
@@ -391,7 +391,7 @@ void Transpose4x4_32_SSE2(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "movdqu (%0),%%xmm0 \n" // a b c d
@@ -447,7 +447,7 @@ void Transpose4x4_32_AVX2(const uint8_t* src,
 uint8_t* dst,
 int dst_stride,
 int width) {
-asm volatile (
+asm volatile(
 // Main loop transpose 2 blocks of 4x4. Read a column, write a row.
 "1: \n"
 "vmovdqu (%0),%%xmm0 \n" // a b c d

View File

@@ -27,7 +27,7 @@ void TransposeWx8_NEON(const uint8_t* src,
 int dst_stride,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
@@ -95,7 +95,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
 int dst_stride_b,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
@@ -184,7 +184,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
 uint8_t* dst1 = dst + dst_stride;
 uint8_t* dst2 = dst1 + dst_stride;
 uint8_t* dst3 = dst2 + dst_stride;
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "vld4.32 {d0[0], d2[0], d4[0], d6[0]}, [%0], %9 \n"

View File

@@ -27,7 +27,7 @@ void TransposeWx16_NEON(const uint8_t* src,
 int dst_stride,
 int width) {
 const uint8_t* src_temp;
-asm volatile (
+asm volatile(
 "1: \n"
 "mov %[src_temp], %[src] \n"
@@ -145,7 +145,7 @@ void TransposeUVWx8_NEON(const uint8_t* src,
 int dst_stride_b,
 int width) {
 const uint8_t* temp;
-asm volatile (
+asm volatile(
 // loops are on blocks of 8. loop will stop when
 // counter gets to or below 0. starting the counter
 // at w-8 allow for this
@@ -239,7 +239,7 @@ void Transpose4x4_32_NEON(const uint8_t* src,
 uint8_t* dst1 = dst + dst_stride;
 uint8_t* dst2 = dst1 + dst_stride;
 uint8_t* dst3 = dst2 + dst_stride;
-asm volatile (
+asm volatile(
 // Main loop transpose 4x4. Read a column, write a row.
 "1: \n"
 "ld4 {v0.s, v1.s, v2.s, v3.s}[0], [%0], %9 \n"

File diff suppressed because it is too large.

View File

@@ -2039,7 +2039,7 @@ static void ARGBToYMatrixRow_LASX(const uint8_t* src_argb,
 int width,
 const struct RgbConstants* rgbconstants) {
 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2101,7 +2101,7 @@ static void RGBAToYMatrixRow_LASX(const uint8_t* src_rgba,
 int width,
 const struct RgbConstants* rgbconstants) {
 int32_t shuff[8] = {0, 4, 1, 5, 2, 6, 3, 7};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants
@@ -2165,7 +2165,7 @@ static void RGBToYMatrixRow_LASX(const uint8_t* src_rgba,
 1, 0, 4, 0, 7, 0, 10, 0, 13, 0, 16, 0, 19, 0, 22, 0,
 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0,
 25, 0, 28, 0, 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-asm volatile (
+asm volatile(
 "xvldrepl.b $xr0, %3, 0 \n\t" // load rgbconstants
 "xvldrepl.b $xr1, %3, 1 \n\t" // load rgbconstants
 "xvldrepl.b $xr2, %3, 2 \n\t" // load rgbconstants

View File

@@ -2807,7 +2807,7 @@ static void ARGBToYMatrixRow_LSX(const uint8_t* src_argb,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2866,7 +2866,7 @@ static void RGBAToYMatrixRow_LSX(const uint8_t* src_rgba,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants
@@ -2924,7 +2924,7 @@ static void RGBToYMatrixRow_LSX(const uint8_t* src_rgba,
 7, 9, 10, 12, 13, 15, 1, 0, 4, 0, 7, 0, 10,
 0, 13, 0, 16, 0, 19, 0, 22, 0, 25, 0, 28, 0,
 31, 0, 2, 0, 5, 0, 8, 0, 11, 0, 14, 0};
-asm volatile (
+asm volatile(
 "vldrepl.b $vr0, %3, 0 \n\t" // load rgbconstants
 "vldrepl.b $vr1, %3, 1 \n\t" // load rgbconstants
 "vldrepl.b $vr2, %3, 2 \n\t" // load rgbconstants

View File

@@ -140,7 +140,7 @@ void I444ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV444 YUVTORGB
@@ -164,7 +164,7 @@ void I444ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV444 YUVTORGB
 RGBTORGB8
@@ -187,7 +187,7 @@ void I422ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -212,7 +212,7 @@ void I444AlphaToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV444 YUVTORGB
 RGBTORGB8
@@ -238,7 +238,7 @@ void I422AlphaToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV422 YUVTORGB
 RGBTORGB8
@@ -263,7 +263,7 @@ void I422ToRGBARow_NEON(const uint8_t* src_y,
 uint8_t* dst_rgba,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -285,7 +285,7 @@ void I422ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -316,7 +316,7 @@ void I422ToRGB565Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV422 YUVTORGB
@@ -348,7 +348,7 @@ void I422ToARGB1555Row_NEON(const uint8_t* src_y,
 uint8_t* dst_argb1555,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "1: \n" READYUV422 YUVTORGB
 RGBTORGB8
@@ -381,7 +381,7 @@ void I422ToARGB4444Row_NEON(const uint8_t* src_y,
 uint8_t* dst_argb4444,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "vmov.u8 d7, #0x0f \n" // vbic bits to clear
@@ -404,7 +404,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUV400 YUVTORGB
@@ -421,7 +421,7 @@ void I400ToARGBRow_NEON(const uint8_t* src_y,
 }
 void J400ToARGBRow_NEON(const uint8_t* src_y, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d23, #255 \n"
 "1: \n"
 "vld1.8 {d20}, [%0]! \n"
@@ -442,7 +442,7 @@ void NV12ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -463,7 +463,7 @@ void NV21ToARGBRow_NEON(const uint8_t* src_y,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV21 YUVTORGB RGBTORGB8
@@ -484,7 +484,7 @@ void NV12ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -505,7 +505,7 @@ void NV21ToRGB24Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb24,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV21 YUVTORGB RGBTORGB8
@@ -526,7 +526,7 @@ void NV12ToRGB565Row_NEON(const uint8_t* src_y,
 uint8_t* dst_rgb565,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READNV12 YUVTORGB RGBTORGB8
@@ -546,7 +546,7 @@ void YUY2ToARGBRow_NEON(const uint8_t* src_yuy2,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READYUY2 YUVTORGB RGBTORGB8
@@ -565,7 +565,7 @@ void UYVYToARGBRow_NEON(const uint8_t* src_uyvy,
 uint8_t* dst_argb,
 const struct YuvConstants* yuvconstants,
 int width) {
-asm volatile (
+asm volatile(
 YUVTORGB_SETUP
 "vmov.u8 d6, #255 \n"
 "1: \n" READUYVY YUVTORGB RGBTORGB8
@@ -585,7 +585,7 @@ void SplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
 "subs %3, %3, #16 \n" // 16 processed per loop
@@ -609,7 +609,7 @@ void DetileRow_NEON(const uint8_t* src,
 ptrdiff_t src_tile_stride,
 uint8_t* dst,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %3 \n" // load 16 bytes
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -629,7 +629,7 @@ void DetileRow_16_NEON(const uint16_t* src,
 ptrdiff_t src_tile_stride,
 uint16_t* dst,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.16 {q0, q1}, [%0], %3 \n" // load 16 pixels
 "subs %2, %2, #16 \n" // 16 processed per loop
@@ -650,7 +650,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {d0, d1}, [%0], %4 \n"
 "subs %3, %3, #16 \n"
@@ -675,7 +675,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 ptrdiff_t src_uv_tile_stride,
 uint8_t* dst_yuy2,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
 "pld [%0, #1792] \n"
@@ -701,7 +701,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 ptrdiff_t src_uv_tile_stride,
 uint8_t* dst_yuy2,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0], %4 \n" // Load 16 Y
 "vld1.8 {q1}, [%1], %5 \n" // Load 8 UV
@@ -723,7 +723,7 @@ void DetileToYUY2_NEON(const uint8_t* src_y,
 #endif
 void UnpackMT2T_NEON(const uint8_t* src, uint16_t* dst, size_t size) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q14}, [%0]! \n" // Load lower bits.
 "vld1.8 {q9}, [%0]! \n" // Load upper bits row
@@ -767,7 +767,7 @@ void MergeUVRow_NEON(const uint8_t* src_u,
 const uint8_t* src_v,
 uint8_t* dst_uv,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load U
 "vld1.8 {q1}, [%1]! \n" // load V
@@ -789,7 +789,7 @@ void SplitRGBRow_NEON(const uint8_t* src_rgb,
 uint8_t* dst_g,
 uint8_t* dst_b,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB
 "vld3.8 {d1, d3, d5}, [%0]! \n" // next 8 RGB
@@ -814,7 +814,7 @@ void MergeRGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_b,
 uint8_t* dst_rgb,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load R
 "vld1.8 {q1}, [%1]! \n" // load G
@@ -840,7 +840,7 @@ void SplitARGBRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_b,
 uint8_t* dst_a,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@@ -868,7 +868,7 @@ void MergeARGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_a,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q2}, [%0]! \n" // load R
 "vld1.8 {q1}, [%1]! \n" // load G
@@ -895,7 +895,7 @@ void SplitXRGBRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_g,
 uint8_t* dst_b,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // next 8 ARGB
@@ -920,7 +920,7 @@ void MergeXRGBRow_NEON(const uint8_t* src_r,
 const uint8_t* src_b,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 q3, #255 \n" // load A(255)
 "1: \n"
 "vld1.8 {q2}, [%0]! \n" // load R
@@ -947,7 +947,7 @@ void MergeXR30Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 10 - depth;
-asm volatile (
+asm volatile(
 "vmov.u32 q14, #1023 \n"
 "vdup.32 q15, %5 \n"
 "1: \n"
@@ -984,7 +984,7 @@ void MergeXR30Row_10_NEON(const uint16_t* src_r,
 uint8_t* dst_ar30,
 int /* depth */,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u32 q14, #1023 \n"
 "1: \n"
 "vld1.16 {d4}, [%2]! \n" // B
@@ -1021,7 +1021,7 @@ void MergeAR64Row_NEON(const uint16_t* src_r,
 int width) {
 int shift = 16 - depth;
 int mask = (1 << depth) - 1;
-asm volatile (
+asm volatile(
 "vdup.u16 q15, %6 \n"
 "vdup.u16 q14, %7 \n"
@@ -1061,7 +1061,7 @@ void MergeXR64Row_NEON(const uint16_t* src_r,
 int width) {
 int shift = 16 - depth;
 int mask = (1 << depth) - 1;
-asm volatile (
+asm volatile(
 "vmov.u8 q3, #0xff \n" // A (0xffff)
 "vdup.u16 q15, %5 \n"
@@ -1098,7 +1098,7 @@ void MergeARGB16To8Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 8 - depth;
-asm volatile (
+asm volatile(
 "vdup.16 q15, %6 \n"
 "1: \n"
@@ -1134,7 +1134,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 int depth,
 int width) {
 int shift = 8 - depth;
-asm volatile (
+asm volatile(
 "vdup.16 q15, %5 \n"
 "vmov.u8 d6, #0xff \n" // A (0xff)
@@ -1162,7 +1162,7 @@ void MergeXRGB16To8Row_NEON(const uint16_t* src_r,
 // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
 void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
 "subs %2, %2, #32 \n" // 32 processed per loop
@@ -1178,7 +1178,7 @@ void CopyRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 // SetRow writes 'width' bytes using an 8 bit value repeated.
 void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
-asm volatile (
+asm volatile(
 "vdup.8 q0, %2 \n" // duplicate 16 bytes
 "1: \n"
 "subs %1, %1, #16 \n" // 16 bytes per loop
@@ -1192,7 +1192,7 @@ void SetRow_NEON(uint8_t* dst, uint8_t v8, int width) {
 // ARGBSetRow writes 'width' pixels using an 32 bit value repeated.
 void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
-asm volatile (
+asm volatile(
 "vdup.u32 q0, %2 \n" // duplicate 4 ints
 "1: \n"
 "subs %1, %1, #4 \n" // 4 pixels per loop
@@ -1205,7 +1205,7 @@ void ARGBSetRow_NEON(uint8_t* dst, uint32_t v32, int width) {
 }
 void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "add %0, %0, %2 \n"
 "sub %0, %0, #32 \n" // 32 bytes per loop
@@ -1227,7 +1227,7 @@ void MirrorRow_NEON(const uint8_t* src, uint8_t* dst, int width) {
 }
 void MirrorUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_uv, int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "mov r12, #-16 \n"
 "add %0, %0, %2, lsl #1 \n"
@@ -1250,7 +1250,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 // Start at end of source row.
 "mov r12, #-16 \n"
 "add %0, %0, %3, lsl #1 \n"
@@ -1272,7 +1272,7 @@ void MirrorSplitUVRow_NEON(const uint8_t* src_uv,
 }
 void ARGBMirrorRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "add %0, %0, %2, lsl #2 \n"
 "sub %0, #32 \n"
@@ -1296,7 +1296,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
 uint8_t* dst_rgb24,
 int width) {
 src_rgb24 += width * 3 - 24;
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d0, d1, d2}, [%0], %3 \n" // src -= 24
 "subs %2, #8 \n" // 8 pixels per loop.
@@ -1315,7 +1315,7 @@ void RGB24MirrorRow_NEON(const uint8_t* src_rgb24,
 void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d4, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
@@ -1331,7 +1331,7 @@ void RGB24ToARGBRow_NEON(const uint8_t* src_rgb24,
 }
 void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d4, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@@ -1348,7 +1348,7 @@ void RAWToARGBRow_NEON(const uint8_t* src_raw, uint8_t* dst_argb, int width) {
 }
 void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d0, #255 \n" // Alpha
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
@@ -1364,7 +1364,7 @@ void RAWToRGBARow_NEON(const uint8_t* src_raw, uint8_t* dst_rgba, int width) {
 );
 }
 void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1395,7 +1395,7 @@ void RAWToRGB24Row_NEON(const uint8_t* src_raw, uint8_t* dst_rgb24, int width) {
 void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
@@ -1441,7 +1441,7 @@ void RGB565ToARGBRow_NEON(const uint8_t* src_rgb565,
 void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
@@ -1470,7 +1470,7 @@ void ARGB1555ToARGBRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // Alpha
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
@@ -1489,7 +1489,7 @@ void ARGB4444ToARGBRow_NEON(const uint8_t* src_argb4444,
 void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 uint8_t* dst_rgb24,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 16 pixels of ARGB.
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n"
@@ -1506,7 +1506,7 @@ void ARGBToRGB24Row_NEON(const uint8_t* src_argb,
 }
 void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1522,7 +1522,7 @@ void ARGBToRAWRow_NEON(const uint8_t* src_argb, uint8_t* dst_raw, int width) {
 }
 void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
 "subs %2, %2, #16 \n" // 16 processed per loop.
@@ -1537,7 +1537,7 @@ void YUY2ToYRow_NEON(const uint8_t* src_yuy2, uint8_t* dst_y, int width) {
 }
 void UYVYToYRow_NEON(const uint8_t* src_uyvy, uint8_t* dst_y, int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
 "subs %2, %2, #16 \n" // 16 processed per loop.
@@ -1555,7 +1555,7 @@ void YUY2ToUV422Row_NEON(const uint8_t* src_yuy2,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@@ -1575,7 +1575,7 @@ void UYVYToUV422Row_NEON(const uint8_t* src_uyvy,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
 "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
@@ -1596,7 +1596,7 @@ void YUY2ToUVRow_NEON(const uint8_t* src_yuy2,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "add %1, %0, %1 \n" // stride + src_yuy2
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
@@ -1623,7 +1623,7 @@ void UYVYToUVRow_NEON(const uint8_t* src_uyvy,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "add %1, %0, %1 \n" // stride + src_uyvy
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
@@ -1649,7 +1649,7 @@ void YUY2ToNVUVRow_NEON(const uint8_t* src_yuy2,
 int stride_yuy2,
 uint8_t* dst_uv,
 int width) {
-asm volatile (
+asm volatile(
 "add %1, %0, %1 \n" // stride + src_yuy2
 "1: \n"
 "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
@@ -1673,7 +1673,7 @@ void ARGBShuffleRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 const uint8_t* shuffler,
 int width) {
-asm volatile (
+asm volatile(
 "vld1.8 {q2}, [%3] \n" // shuffler
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
@@ -1695,7 +1695,7 @@ void I422ToYUY2Row_NEON(const uint8_t* src_y,
 const uint8_t* src_v,
 uint8_t* dst_yuy2,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
 "vld1.8 {d1}, [%1]! \n" // load 8 Us
@@ -1717,7 +1717,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
 const uint8_t* src_v,
 uint8_t* dst_uyvy,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
 "vld1.8 {d0}, [%1]! \n" // load 8 Us
@@ -1737,7 +1737,7 @@ void I422ToUYVYRow_NEON(const uint8_t* src_y,
 void ARGBToRGB565Row_NEON(const uint8_t* src_argb,
 uint8_t* dst_rgb565,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1755,7 +1755,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_rgb,
 uint32_t dither4,
 int width) {
-asm volatile (
+asm volatile(
 "vdup.32 d7, %2 \n" // dither4
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%1]! \n" // load 8 pixels of ARGB.
@@ -1776,7 +1776,7 @@ void ARGBToRGB565DitherRow_NEON(const uint8_t* src_argb,
 void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb1555,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 pixels of ARGB.
 "subs %2, %2, #8 \n" // 8 processed per loop.
@@ -1793,7 +1793,7 @@ void ARGBToARGB1555Row_NEON(const uint8_t* src_argb,
 void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb4444,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d7, #0x0f \n" // bits to clear with
 // vbic.
 "1: \n"
@@ -1812,7 +1812,7 @@ void ARGBToARGB4444Row_NEON(const uint8_t* src_argb,
 void ARGBExtractAlphaRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_a,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels
 "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels
@@ -1839,7 +1839,7 @@ static void ARGBToUV444MatrixRow_NEON(
 uint8_t* dst_v,
 int width,
 const struct RgbUVConstants* rgbuvconstants) {
-asm volatile (
+asm volatile(
 "vld1.8 {d0}, [%4] \n" // load rgbuvconstants
 "vdup.u8 d24, d0[0] \n" // UB 0.875 coefficient
@@ -2367,7 +2367,7 @@ void RGB565ToUVRow_NEON(const uint8_t* src_rgb565,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "add %1, %0, %1 \n" // src_stride + src_argb
 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
 // coefficient
@@ -2433,7 +2433,7 @@ void ARGB1555ToUVRow_NEON(const uint8_t* src_argb1555,
 uint8_t* dst_u,
 uint8_t* dst_v,
 int width) {
-asm volatile (
+asm volatile(
 "add %1, %0, %1 \n" // src_stride + src_argb
 "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875
 // coefficient
@@ -2551,7 +2551,7 @@ void ARGB4444ToUVRow_NEON(const uint8_t* src_argb4444,
 }
 void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
 "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
 "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@@ -2577,7 +2577,7 @@ void RGB565ToYRow_NEON(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
 void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 uint8_t* dst_y,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
 "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
 "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@@ -2603,7 +2603,7 @@ void ARGB1555ToYRow_NEON(const uint8_t* src_argb1555,
 void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 uint8_t* dst_y,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d24, #25 \n" // B * 0.1016 coefficient
 "vmov.u8 d25, #129 \n" // G * 0.5078 coefficient
 "vmov.u8 d26, #66 \n" // R * 0.2578 coefficient
@@ -2629,7 +2629,7 @@ void ARGB4444ToYRow_NEON(const uint8_t* src_argb4444,
 void ARGBToAR64Row_NEON(const uint8_t* src_argb,
 uint16_t* dst_ar64,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {q0}, [%0]! \n"
 "vld1.8 {q2}, [%0]! \n"
@@ -2652,7 +2652,7 @@ static const uvec8 kShuffleARGBToABGR = {2, 1, 0, 3, 6, 5, 4, 7,
 void ARGBToAB64Row_NEON(const uint8_t* src_argb,
 uint16_t* dst_ab64,
 int width) {
-asm volatile (
+asm volatile(
 "vld1.8 {q4}, [%3] \n" // shuffler
 "1: \n"
@@ -2678,7 +2678,7 @@ void ARGBToAB64Row_NEON(const uint8_t* src_argb,
 void AR64ToARGBRow_NEON(const uint16_t* src_ar64,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.16 {q0}, [%0]! \n"
 "vld1.16 {q1}, [%0]! \n"
@@ -2704,7 +2704,7 @@ static const uvec8 kShuffleAB64ToARGB = {5, 3, 1, 7, 13, 11, 9, 15};
 void AB64ToARGBRow_NEON(const uint16_t* src_ab64,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vld1.8 {d8}, [%3] \n" // shuffler
 "1: \n"
@@ -2757,7 +2757,7 @@ static void ARGBToYMatrixRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vld1.8 {d0}, [%3] \n" // load rgbconstants
 "vdup.u8 d20, d0[0] \n"
 "vdup.u8 d21, d0[1] \n"
@@ -2807,7 +2807,7 @@ static void RGBAToYMatrixRow_NEON(const uint8_t* src_rgba,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vld1.8 {d0}, [%3] \n" // load rgbconstants
 "vdup.u8 d20, d0[0] \n"
 "vdup.u8 d21, d0[1] \n"
@@ -2851,7 +2851,7 @@ static void RGBToYMatrixRow_NEON(const uint8_t* src_rgb,
 uint8_t* dst_y,
 int width,
 const struct RgbConstants* rgbconstants) {
-asm volatile (
+asm volatile(
 "vld1.8 {d0}, [%3] \n" // load rgbconstants
 "vdup.u8 d20, d0[0] \n"
 "vdup.u8 d21, d0[1] \n"
@@ -2903,7 +2903,7 @@ void InterpolateRow_NEON(uint8_t* dst_ptr,
 int dst_width,
 int source_y_fraction) {
 int y1_fraction = source_y_fraction;
-asm volatile (
+asm volatile(
 "cmp %4, #0 \n"
 "beq 100f \n"
 "add %2, %1 \n"
@@ -2965,7 +2965,7 @@ void InterpolateRow_16_NEON(uint16_t* dst_ptr,
 int y0_fraction = 256 - y1_fraction;
 const uint16_t* src_ptr1 = src_ptr + src_stride;
-asm volatile (
+asm volatile(
 "cmp %4, #0 \n"
 "beq 100f \n"
 "cmp %4, #128 \n"
@@ -3020,7 +3020,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
 const uint8_t* src_argb1,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "subs %3, #8 \n"
 "blt 89f \n"
 // Blend 8 pixels.
@@ -3079,7 +3079,7 @@ void ARGBBlendRow_NEON(const uint8_t* src_argb,
 void ARGBAttenuateRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u16 q15, #0x00ff \n" // 255 for rounding up
 // Attenuate 8 pixels.
@@ -3108,7 +3108,7 @@ void ARGBQuantizeRow_NEON(uint8_t* dst_argb,
 int interval_size,
 int interval_offset,
 int width) {
-asm volatile (
+asm volatile(
 "vdup.u16 q8, %2 \n"
 "vshr.u16 q8, q8, #1 \n" // scale >>= 1
 "vdup.u16 q9, %3 \n" // interval multiply.
@@ -3150,7 +3150,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 int width,
 uint32_t value) {
-asm volatile (
+asm volatile(
 "vdup.u32 q0, %3 \n" // duplicate scale value.
 "vzip.u8 d0, d1 \n" // d0 aarrggbb.
 "vshr.u16 q0, q0, #1 \n" // scale / 2.
@@ -3184,7 +3184,7 @@ void ARGBShadeRow_NEON(const uint8_t* src_argb,
 // Similar to ARGBToYJ but stores ARGB.
 // C code is (29 * b + 150 * g + 77 * r + 128) >> 8;
 void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d24, #29 \n" // B * 0.1140 coefficient
 "vmov.u8 d25, #150 \n" // G * 0.5870 coefficient
 "vmov.u8 d26, #77 \n" // R * 0.2990 coefficient
@@ -3211,7 +3211,7 @@ void ARGBGrayRow_NEON(const uint8_t* src_argb, uint8_t* dst_argb, int width) {
 // g = (r * 45 + g * 88 + b * 22) >> 7
 // r = (r * 50 + g * 98 + b * 24) >> 7
 void ARGBSepiaRow_NEON(uint8_t* dst_argb, int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d20, #17 \n" // BB coefficient
 "vmov.u8 d21, #68 \n" // BG coefficient
 "vmov.u8 d22, #35 \n" // BR coefficient
@@ -3252,7 +3252,7 @@ void ARGBColorMatrixRow_NEON(const uint8_t* src_argb,
 uint8_t* dst_argb,
 const int8_t* matrix_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
 "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
 "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
@@ -3311,7 +3311,7 @@ void ARGBMultiplyRow_NEON(const uint8_t* src_argb,
 const uint8_t* src_argb1,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 // 8 pixel loop.
 "1: \n"
 "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
@@ -3340,7 +3340,7 @@ void ARGBAddRow_NEON(const uint8_t* src_argb,
 const uint8_t* src_argb1,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 // 8 pixel loop.
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -3363,7 +3363,7 @@ void ARGBSubtractRow_NEON(const uint8_t* src_argb,
 const uint8_t* src_argb1,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 // 8 pixel loop.
 "1: \n"
 "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
@@ -3390,7 +3390,7 @@ void SobelRow_NEON(const uint8_t* src_sobelx,
 const uint8_t* src_sobely,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // alpha
 // 8 pixel loop.
 "1: \n"
@@ -3415,7 +3415,7 @@ void SobelToPlaneRow_NEON(const uint8_t* src_sobelx,
 const uint8_t* src_sobely,
 uint8_t* dst_y,
 int width) {
-asm volatile (
+asm volatile(
 // 16 pixel loop.
 "1: \n"
 "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
@@ -3441,7 +3441,7 @@ void SobelXYRow_NEON(const uint8_t* src_sobelx,
 const uint8_t* src_sobely,
 uint8_t* dst_argb,
 int width) {
-asm volatile (
+asm volatile(
 "vmov.u8 d3, #255 \n" // alpha
 // 8 pixel loop.
 "1: \n"
@@ -3468,7 +3468,7 @@ void SobelXRow_NEON(const uint8_t* src_y0,
 const uint8_t* src_y2,
 uint8_t* dst_sobelx,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {d0}, [%0],%5 \n" // top
 "vld1.8 {d1}, [%0],%6 \n"
@@ -3506,7 +3506,7 @@ void SobelYRow_NEON(const uint8_t* src_y0,
 const uint8_t* src_y1,
 uint8_t* dst_sobely,
 int width) {
-asm volatile (
+asm volatile(
 "1: \n"
 "vld1.8 {d0}, [%0],%4 \n" // left
 "vld1.8 {d1}, [%1],%4 \n"
@ -3543,7 +3543,7 @@ void HalfFloatRow_NEON(const uint16_t* src,
uint16_t* dst, uint16_t* dst,
float scale, float scale,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts "vld1.16 {q0, q1}, [%0]! \n" // load 16 shorts
@ -3577,7 +3577,7 @@ void ByteToFloatRow_NEON(const uint8_t* src,
float* dst, float* dst,
float scale, float scale,
int width) { int width) {
asm volatile ( asm volatile(
"1: \n" "1: \n"
"vld1.8 {d2}, [%0]! \n" // load 8 bytes "vld1.8 {d2}, [%0]! \n" // load 8 bytes
@ -3606,7 +3606,7 @@ void GaussCol_NEON(const uint16_t* src0,
const uint16_t* src4, const uint16_t* src4,
uint32_t* dst, uint32_t* dst,
int width) { int width) {
asm volatile ( asm volatile(
"vmov.u16 d6, #4 \n" // constant 4 "vmov.u16 d6, #4 \n" // constant 4
"vmov.u16 d7, #6 \n" // constant 6 "vmov.u16 d7, #6 \n" // constant 6
@ -3643,7 +3643,7 @@ void GaussRow_NEON(const uint32_t* src, uint16_t* dst, int width) {
const uint32_t* src1 = src + 1; const uint32_t* src1 = src + 1;
const uint32_t* src2 = src + 2; const uint32_t* src2 = src + 2;
const uint32_t* src3 = src + 3; const uint32_t* src3 = src + 3;
asm volatile ( asm volatile(
"vmov.u32 q10, #4 \n" // constant 4 "vmov.u32 q10, #4 \n" // constant 4
"vmov.u32 q11, #6 \n" // constant 6 "vmov.u32 q11, #6 \n" // constant 6
@@ -3681,7 +3681,7 @@ void NV21ToYUV24Row_NEON(const uint8_t* src_y,
const uint8_t* src_vu,
uint8_t* dst_yuv24,
int width) {
-asm volatile (
+asm volatile(
"1: \n"
"vld1.8 {q2}, [%0]! \n" // load 16 Y values
"vld2.8 {d0, d2}, [%1]! \n" // load 8 VU values
@@ -3705,7 +3705,7 @@ void AYUVToUVRow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_uv,
int width) {
-asm volatile (
+asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@@ -3736,7 +3736,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
int src_stride_ayuv,
uint8_t* dst_vu,
int width) {
-asm volatile (
+asm volatile(
"add %1, %0, %1 \n" // src_stride + src_AYUV
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels.
@@ -3766,7 +3766,7 @@ void AYUVToVURow_NEON(const uint8_t* src_ayuv,
// Copy row of AYUV Y's into Y.
// Similar to ARGBExtractAlphaRow_NEON
void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
-asm volatile (
+asm volatile(
"1: \n"
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 AYUV pixels
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 AYUV pixels
@@ -3782,7 +3782,7 @@ void AYUVToYRow_NEON(const uint8_t* src_ayuv, uint8_t* dst_y, int width) {
// Convert UV plane of NV12 to VU of NV21.
void SwapUVRow_NEON(const uint8_t* src_uv, uint8_t* dst_vu, int width) {
-asm volatile (
+asm volatile(
"1: \n"
"vld2.8 {d0, d2}, [%0]! \n" // load 16 UV values
"vld2.8 {d1, d3}, [%0]! \n"
@@ -3805,7 +3805,7 @@ void HalfMergeUVRow_NEON(const uint8_t* src_u,
int width) {
const uint8_t* src_u_1 = src_u + src_stride_u;
const uint8_t* src_v_1 = src_v + src_stride_v;
-asm volatile (
+asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load 16 U values
"vld1.8 {q1}, [%2]! \n" // load 16 V values
@@ -3836,7 +3836,7 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
int depth,
int width) {
int shift = depth - 16; // Negative for right shift.
-asm volatile (
+asm volatile(
"vdup.16 q2, %4 \n"
"1: \n"
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
@@ -3860,7 +3860,7 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int depth,
int width) {
int shift = 16 - depth;
-asm volatile (
+asm volatile(
"vdup.16 q2, %4 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
@@ -3882,7 +3882,7 @@ void MultiplyRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
-asm volatile (
+asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"
@@ -3904,7 +3904,7 @@ void DivideRow_16_NEON(const uint16_t* src_y,
uint16_t* dst_y,
int scale,
int width) {
-asm volatile (
+asm volatile(
"vdup.16 d8, %3 \n"
"1: \n"
"vld1.16 {q2, q3}, [%0]! \n"
@@ -3936,7 +3936,7 @@ void Convert16To8Row_NEON(const uint16_t* src_y,
int scale,
int width) {
int shift = 15 - __builtin_clz((int32_t)scale); // Negative shl is shr
-asm volatile (
+asm volatile(
"vdup.16 q2, %3 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n"

File diff suppressed because it is too large

View File

@@ -47,7 +47,7 @@ extern "C" {
// register) is set to round-to-nearest-up mode(0).
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, bb, bg, br) \
{ \
-asm volatile ("csrwi vxrm, 0"); \
+asm volatile("csrwi vxrm, 0"); \
ub = yuvconst->kUVCoeff[0]; \
vr = yuvconst->kUVCoeff[1]; \
ug = yuvconst->kUVCoeff[2]; \
@@ -1238,7 +1238,7 @@ void I400ToARGBRow_RVV(const uint8_t* src_y,
vuint16m4_t v_yg = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[0], vl);
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) sets to round-to-nearest-up mode(0).
-asm volatile ("csrwi vxrm, 0");
+asm volatile("csrwi vxrm, 0");
if (is_yb_positive) {
v_yb = __riscv_vmv_v_x_u16m4(yuvconstants->kRGBCoeffBias[4] - 32, vl);
} else {
@@ -1632,7 +1632,7 @@ void InterpolateRow_RVV(uint8_t* dst_ptr,
}
// To match behavior on other platforms, vxrm (fixed-point rounding mode
// register) is set to round-to-nearest-up(0).
-asm volatile ("csrwi vxrm, 0");
+asm volatile("csrwi vxrm, 0");
// Blend 50 / 50.
if (y1_fraction == 128) {
do {
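
Setting vxrm to 0 before the RVV fixed-point instructions selects round-to-nearest-up, so the averaging adds in these kernels agree with the rounding halving adds the other platforms use. In scalar terms the convention is simply:

  #include <stdint.h>

  // Round-to-nearest-up average, the behavior vxrm = 0 selects for the
  // RVV averaging adds used by these kernels.
  static inline uint8_t avg_round_up(uint8_t a, uint8_t b) {
    return (uint8_t)((a + b + 1) >> 1);
  }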

View File

@@ -241,7 +241,7 @@ static void ARGBToUVMatrixRow_SVE2(const uint8_t* src_argb,
const int16_t* uvconstants) {
const uint8_t* src_argb_1 = src_argb + src_stride_argb;
uint64_t vl;
-asm volatile (
+asm volatile(
"ptrue p0.b \n"
"ld1rd {z24.d}, p0/z, [%[uvconstants]] \n"
"ld1rd {z25.d}, p0/z, [%[uvconstants], #8] \n"

View File

@@ -10,8 +10,8 @@
#include "libyuv/scale.h"
-#include <limits.h>
#include <assert.h>
+#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -1233,10 +1233,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
(void)src_fourcc; // TODO(fbarchard): implement and/or assert.
(void)dst_fourcc;
const int abs_src_height = (src_height < 0) ? -src_height : src_height;
-if (!src_y || !src_u || !src_v || !dst_argb ||
-    src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
-    dst_width <= 0 || dst_height <= 0 ||
-    clip_width <= 0 || clip_height <= 0) {
+if (!src_y || !src_u || !src_v || !dst_argb || src_width <= 0 ||
+    src_width > INT_MAX / 4 || src_height == 0 || dst_width <= 0 ||
+    dst_height <= 0 || clip_width <= 0 || clip_height <= 0) {
return -1;
}
const uint64_t argb_buffer_size = (uint64_t)src_width * abs_src_height * 4;
@@ -1250,9 +1249,9 @@ int YUVToARGBScaleClip(const uint8_t* src_y,
I420ToARGB(src_y, src_stride_y, src_u, src_stride_u, src_v, src_stride_v,
argb_buffer, src_width * 4, src_width, src_height);
-r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height, dst_argb,
-    dst_stride_argb, dst_width, dst_height, clip_x, clip_y,
-    clip_width, clip_height, filtering);
+r = ARGBScaleClip(argb_buffer, src_width * 4, src_width, abs_src_height,
+    dst_argb, dst_stride_argb, dst_width, dst_height, clip_x,
+    clip_y, clip_width, clip_height, filtering);
free(argb_buffer);
return r;
}
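
The reflowed validation also documents the overflow strategy: src_width is capped at INT_MAX / 4 so the int stride src_width * 4 cannot overflow, and the intermediate buffer size is computed in 64-bit arithmetic before allocation. The same guard in isolation (illustrative helper, not libyuv API):

  #include <limits.h>
  #include <stdint.h>
  #include <stdlib.h>

  // Size a width x height ARGB buffer without overflowing int or size_t.
  static uint8_t* AllocArgbBuffer(int width, int height) {
    if (width <= 0 || width > INT_MAX / 4 || height <= 0) {
      return NULL;
    }
    uint64_t size = (uint64_t)width * (uint64_t)height * 4;
    if (size > (uint64_t)(size_t)-1) {  // would not fit in size_t
      return NULL;
    }
    return (uint8_t*)malloc((size_t)size);
  }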

View File

@@ -97,7 +97,7 @@ void ScaleRowDown2_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
// 16 pixel loop.
LABELALIGN
"1: \n"
@@ -123,7 +123,7 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@@ -154,7 +154,7 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@@ -195,7 +195,7 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
"vmovdqu 0x20(%0),%%ymm1 \n"
@@ -221,7 +221,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@@ -254,7 +254,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@@ -297,7 +297,7 @@ void ScaleRowDown4_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"pcmpeqb %%xmm5,%%xmm5 \n"
"psrld $0x18,%%xmm5 \n"
"pslld $0x10,%%xmm5 \n"
@@ -328,7 +328,7 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
intptr_t stridex3;
-asm volatile (
+asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n"
"psrlw $0xf,%%xmm4 \n"
"movdqa %%xmm4,%%xmm5 \n"
@@ -383,7 +383,7 @@ void ScaleRowDown4_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"vpcmpeqb %%ymm5,%%ymm5,%%ymm5 \n"
"vpsrld $0x18,%%ymm5,%%ymm5 \n"
"vpslld $0x10,%%ymm5,%%ymm5 \n"
@@ -416,7 +416,7 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpsllw $0x3,%%ymm4,%%ymm5 \n"
@@ -472,7 +472,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm3 \n"
"movdqa %1,%%xmm4 \n"
"movdqa %2,%%xmm5 \n"
@@ -481,7 +481,7 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf1), // %1
"m"(kShuf2) // %2
);
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm2 \n"
@@ -508,7 +508,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
@@ -517,7 +517,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
@@ -526,7 +526,7 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kMadd11), // %1
"m"(kRound34) // %2
);
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -572,7 +572,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm2 \n" // kShuf01
"movdqa %1,%%xmm3 \n" // kShuf11
"movdqa %2,%%xmm4 \n" // kShuf21
@@ -581,7 +581,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShuf11), // %1
"m"(kShuf21) // %2
);
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm5 \n" // kMadd01
"movdqa %1,%%xmm0 \n" // kMadd11
"movdqa %2,%%xmm1 \n" // kRound34
@@ -591,7 +591,7 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm6 \n"
"movdqu 0x00(%0,%3,1),%%xmm7 \n"
@@ -641,7 +641,7 @@ void ScaleRowDown38_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"movdqa %3,%%xmm4 \n"
"movdqa %4,%%xmm5 \n"
@@ -671,7 +671,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
@@ -682,7 +682,7 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAb2), // %2
"m"(kScaleAb2) // %3
);
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm1 \n"
@@ -714,7 +714,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm2 \n"
"movdqa %1,%%xmm3 \n"
"movdqa %2,%%xmm4 \n"
@@ -724,7 +724,7 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kShufAc3), // %1
"m"(kScaleAc33) // %2
);
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x00(%0,%3,1),%%xmm6 \n"
@@ -782,7 +782,7 @@ static const uvec8 kLinearMadd31 = {3, 1, 1, 3, 3, 1, 1, 3,
void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@@ -838,7 +838,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
// above line
@@ -951,7 +951,7 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"movdqa %3,%%xmm5 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
@@ -1003,7 +1003,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
"psllw $3,%%xmm7 \n" // all 8
@@ -1101,7 +1101,7 @@ void ScaleRowUp2_Bilinear_12_SSSE3(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
@@ -1154,7 +1154,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
@@ -1262,7 +1262,7 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@@ -1303,7 +1303,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
@@ -1388,7 +1388,7 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@@ -1432,7 +1432,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@@ -1514,7 +1514,7 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleRowUp2_Linear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vbroadcastf128 %3,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
@@ -1566,7 +1566,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"vbroadcastf128 %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
@@ -1628,7 +1628,7 @@ void ScaleRowUp2_Bilinear_12_AVX2(const uint16_t* src_ptr,
void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@@ -1678,7 +1678,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
@@ -1761,8 +1761,7 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
-asm volatile (
-"pxor %%xmm5,%%xmm5 \n"
+asm volatile("pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
LABELALIGN
@@ -1793,8 +1792,7 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
-asm volatile (
-"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
+asm volatile("vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
"1: \n"
@@ -1835,7 +1833,7 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
int x,
int dx) {
intptr_t x0, x1, temp_pixel;
-asm volatile (
+asm volatile(
"movd %6,%%xmm2 \n"
"movd %7,%%xmm3 \n"
"movl $0x04040000,%k2 \n"
@@ -1932,7 +1930,7 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
int dx) {
(void)x;
(void)dx;
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@@ -1957,7 +1955,7 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -1979,7 +1977,7 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -2003,7 +2001,7 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
ptrdiff_t src_stride,
uint8_t* dst_argb,
int dst_width) {
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%0),%%xmm0 \n"
"movdqu 0x10(%0),%%xmm1 \n"
@@ -2037,7 +2035,7 @@ void ScaleARGBRowDownEven_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
(void)src_stride;
-asm volatile (
+asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
@@ -2074,7 +2072,7 @@ void ScaleARGBRowDownEvenBox_SSE2(const uint8_t* src_argb,
intptr_t src_stepx_x4 = (intptr_t)(src_stepx);
intptr_t src_stepx_x12;
intptr_t row1 = (intptr_t)(src_stride);
-asm volatile (
+asm volatile(
"lea 0x00(,%1,4),%1 \n"
"lea 0x00(%1,%1,2),%4 \n"
"lea 0x00(%0,%5,1),%5 \n"
@@ -2117,7 +2115,7 @@ void ScaleARGBCols_SSE2(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
-asm volatile (
+asm volatile(
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pshufd $0x0,%%xmm2,%%xmm2 \n"
@@ -2188,7 +2186,7 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
int dx) {
(void)x;
(void)dx;
-asm volatile (
+asm volatile(
"1: \n"
"movdqu (%1),%%xmm0 \n"
"lea 0x10(%1),%1 \n"
@@ -2226,7 +2224,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
int x,
int dx) {
intptr_t x0, x1;
-asm volatile (
+asm volatile(
"movdqa %0,%%xmm4 \n"
"movdqa %1,%%xmm5 \n"
:
@@ -2234,7 +2232,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
"m"(kShuffleFractions) // %1
);
-asm volatile (
+asm volatile(
"movd %5,%%xmm2 \n"
"movd %6,%%xmm3 \n"
"pcmpeqb %%xmm6,%%xmm6 \n"
@@ -2297,7 +2295,7 @@ void ScaleARGBFilterCols_SSSE3(uint8_t* dst_argb,
// Divide num by div and return as 16.16 fixed point result.
int FixedDiv_X86(int num, int div) {
-asm volatile (
+asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
@@ -2311,7 +2309,7 @@ int FixedDiv_X86(int num, int div) {
// Divide num - 1 by div - 1 and return as 16.16 fixed point result.
int FixedDiv1_X86(int num, int div) {
-asm volatile (
+asm volatile(
"cdq \n"
"shld $0x10,%%eax,%%edx \n"
"shl $0x10,%%eax \n"
@@ -2343,7 +2341,7 @@ void ScaleUVRowDown2Box_SSSE3(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqb %%xmm4,%%xmm4 \n" // 01010101
"psrlw $0xf,%%xmm4 \n"
"packuswb %%xmm4,%%xmm4 \n"
@@ -2383,7 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n" // 01010101
"vpsrlw $0xf,%%ymm4,%%ymm4 \n"
"vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
@@ -2427,7 +2425,7 @@ static const uvec8 kUVLinearMadd31 = {3, 1, 3, 1, 1, 3, 1, 3,
void ScaleUVRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
@@ -2468,7 +2466,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
@@ -2552,7 +2550,7 @@ void ScaleUVRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
@@ -2595,7 +2593,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
@@ -2675,7 +2673,7 @@ void ScaleUVRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
void ScaleUVRowUp2_Linear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"pxor %%xmm5,%%xmm5 \n"
"pcmpeqd %%xmm4,%%xmm4 \n"
"psrld $31,%%xmm4 \n"
@@ -2727,7 +2725,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"pxor %%xmm7,%%xmm7 \n"
"pcmpeqd %%xmm6,%%xmm6 \n"
"psrld $31,%%xmm6 \n"
@@ -2818,7 +2816,7 @@ void ScaleUVRowUp2_Bilinear_16_SSE41(const uint16_t* src_ptr,
void ScaleUVRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
@@ -2867,7 +2865,7 @@ void ScaleUVRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
ptrdiff_t dst_stride,
int dst_width) {
-asm volatile (
+asm volatile(
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8

View File

@@ -29,7 +29,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
// load even pixels into q0, odd into q1
"vld2.8 {q0, q1}, [%0]! \n"
@@ -50,7 +50,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld2.8 {q0, q1}, [%0]! \n" // load 32 pixels
"subs %2, %2, #16 \n" // 16 processed per loop
@@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
@@ -101,7 +101,7 @@ void ScaleRowDown4_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
@@ -121,7 +121,7 @@ void ScaleRowDown4Box_NEON(const uint8_t* src_ptr,
const uint8_t* src_ptr1 = src_ptr + src_stride;
const uint8_t* src_ptr2 = src_ptr + src_stride * 2;
const uint8_t* src_ptr3 = src_ptr + src_stride * 3;
-asm volatile (
+asm volatile(
"1: \n"
"vld1.8 {q0}, [%0]! \n" // load up 16x4
"vld1.8 {q1}, [%3]! \n"
@@ -155,7 +155,7 @@ void ScaleRowDown34_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
@@ -173,7 +173,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
@@ -230,7 +230,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vmov.u8 d24, #3 \n"
"add %3, %0 \n"
"1: \n"
@@ -282,7 +282,7 @@ void ScaleRowDown38_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"vld1.8 {q3}, [%3] \n"
"1: \n"
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
@@ -306,7 +306,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride * 2;
-asm volatile (
+asm volatile(
"vld1.16 {q13}, [%5] \n"
"vld1.8 {q14}, [%6] \n"
"vld1.8 {q15}, [%7] \n"
@@ -416,7 +416,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst_ptr,
int dst_width) {
-asm volatile (
+asm volatile(
"vld1.16 {q13}, [%4] \n"
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
@@ -509,7 +509,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
-asm volatile (
+asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
@@ -546,7 +546,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
@@ -608,7 +608,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
-asm volatile (
+asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -644,7 +644,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@@ -695,7 +695,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
-asm volatile (
+asm volatile(
"vmov.u16 d31, #3 \n"
"1: \n"
@@ -739,7 +739,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"vmov.u16 d31, #3 \n"
"vmov.u32 q14, #3 \n"
@@ -791,7 +791,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
-asm volatile (
+asm volatile(
"vmov.u8 d30, #3 \n"
"1: \n"
@@ -828,7 +828,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
-asm volatile (
+asm volatile(
"vmov.u16 q15, #3 \n"
"vmov.u8 d28, #3 \n"
@@ -890,7 +890,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
-asm volatile (
+asm volatile(
"vmov.u16 d30, #3 \n"
"1: \n"
@@ -935,7 +935,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
-asm volatile (
+asm volatile(
"vmov.u16 d30, #3 \n"
"vmov.u32 q14, #3 \n"
@@ -988,7 +988,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
-asm volatile (
+asm volatile(
"1: \n"
"vld1.16 {q1, q2}, [%1] \n" // load accumulator
"vld1.8 {q0}, [%0]! \n" // load 16 bytes
@@ -1086,7 +1086,7 @@ void ScaleARGBRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1114,7 +1114,7 @@ void ScaleARGBRowDown2Linear_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld4.32 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
"vld4.32 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB
@@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@@ -1174,7 +1174,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"mov r12, %3, lsl #2 \n"
"1: \n"
"vld1.32 {d0[0]}, [%0], r12 \n"
@@ -1198,7 +1198,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8_t* src_argb,
int src_stepx,
uint8_t* dst_argb,
int dst_width) {
-asm volatile (
+asm volatile(
"mov r12, %4, lsl #2 \n"
"add %1, %1, %0 \n"
"1: \n"
@@ -1246,7 +1246,7 @@ void ScaleARGBCols_NEON(uint8_t* dst_argb,
int dx) {
int tmp;
const uint8_t* src_tmp = src_argb;
-asm volatile (
+asm volatile(
"1: \n"
// clang-format off
LOAD1_DATA32_LANE(d0, 0)
@@ -1349,7 +1349,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1368,7 +1368,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld2.16 {d0, d2}, [%0]! \n" // load 8 UV pixels.
"vld2.16 {d1, d3}, [%0]! \n" // load next 8 UV
@@ -1387,7 +1387,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@@ -1422,7 +1422,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"vld1.16 {d0[0]}, [%0], %6 \n"
"vld1.16 {d0[1]}, [%1], %6 \n"

View File

@@ -26,7 +26,7 @@ void ScaleRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@@ -48,7 +48,7 @@ void ScaleRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
// load even pixels into v0, odd into v1
"ld2 {v0.16b,v1.16b}, [%0], #32 \n"
@@ -70,7 +70,7 @@ void ScaleRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@@ -543,7 +543,7 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 1;
const uint8_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@@ -599,7 +599,7 @@ void ScaleRowUp2_Linear_12_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
-asm volatile (
+asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@@ -636,7 +636,7 @@ void ScaleRowUp2_Bilinear_12_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@@ -690,7 +690,7 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
-asm volatile (
+asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@@ -735,7 +735,7 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 1;
const uint16_t* src_temp1 = src_ptr1 + 1;
-asm volatile (
+asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
@@ -790,7 +790,7 @@ void ScaleUVRowUp2_Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 2;
-asm volatile (
+asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
@@ -829,7 +829,7 @@ void ScaleUVRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp = src_ptr + 2;
const uint8_t* src_temp1 = src_ptr1 + 2;
-asm volatile (
+asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@@ -885,7 +885,7 @@ void ScaleUVRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 2;
-asm volatile (
+asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@@ -932,7 +932,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp = src_ptr + 2;
const uint16_t* src_temp1 = src_ptr1 + 2;
-asm volatile (
+asm volatile(
"movi v31.4h, #3 \n"
"movi v30.4s, #3 \n"
@@ -987,7 +987,7 @@ void ScaleUVRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
void ScaleAddRow_NEON(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
-asm volatile (
+asm volatile(
"1: \n"
"ld1 {v1.8h, v2.8h}, [%1] \n" // load accumulator
"ld1 {v0.16b}, [%0], #16 \n" // load 16 bytes
@@ -1135,7 +1135,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
const uint8_t* src_ptr1 = src_ptr + src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"ld2 {v0.4s, v1.4s}, [%[src]], #32 \n"
"ld2 {v20.4s, v21.4s}, [%[src1]], #32 \n"
@@ -1166,7 +1166,7 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
const uint8_t* src_argb3 = src_argb + src_stepx * 12;
int64_t i = 0;
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"ldr w10, [%[src], %[i]] \n"
"ldr w11, [%[src1], %[i]] \n"
@@ -1178,14 +1178,10 @@ void ScaleARGBRowDownEven_NEON(const uint8_t* src_argb,
"stp w10, w11, [%[dst]], #8 \n"
"stp w12, w13, [%[dst]], #8 \n"
"b.gt 1b \n"
-: [src]"+r"(src_argb),
-  [src1]"+r"(src_argb1),
-  [src2]"+r"(src_argb2),
-  [src3]"+r"(src_argb3),
-  [dst]"+r"(dst_argb),
-  [width]"+r"(dst_width),
-  [i]"+r"(i)
-: [step]"r"((int64_t)(src_stepx * 16))
+: [src] "+r"(src_argb), [src1] "+r"(src_argb1), [src2] "+r"(src_argb2),
+  [src3] "+r"(src_argb3), [dst] "+r"(dst_argb), [width] "+r"(dst_width),
+  [i] "+r"(i)
+: [step] "r"((int64_t)(src_stepx * 16))
: "memory", "cc", "w10", "w11", "w12", "w13");
}
@@ -1424,7 +1420,7 @@ void ScaleRowDown2Box_16_NEON(const uint16_t* src_ptr,
ptrdiff_t src_stride,
uint16_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %0, %1, lsl #1 \n" // ptr + stide * 2
"1: \n"
@@ -1455,7 +1451,7 @@ void ScaleUVRowDown2_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1474,7 +1470,7 @@ void ScaleUVRowDown2Linear_NEON(const uint8_t* src_ptr,
uint8_t* dst,
int dst_width) {
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"ld2 {v0.8h,v1.8h}, [%0], #32 \n" // load 16 UV
"subs %w2, %w2, #8 \n" // 8 processed per loop.
@@ -1493,7 +1489,7 @@ void ScaleUVRowDown2Box_NEON(const uint8_t* src_ptr,
ptrdiff_t src_stride,
uint8_t* dst,
int dst_width) {
-asm volatile (
+asm volatile(
// change the stride to row 2 pointer
"add %1, %1, %0 \n"
"1: \n"
@@ -1528,7 +1524,7 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
const uint8_t* src2_ptr = src_ptr + src_stepx * 4;
const uint8_t* src3_ptr = src_ptr + src_stepx * 6;
(void)src_stride;
-asm volatile (
+asm volatile(
"1: \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"

View File

@@ -10,8 +10,8 @@
#include "libyuv/scale.h" /* For FilterMode */
-#include <limits.h>
#include <assert.h>
+#include <limits.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
@@ -41,9 +41,9 @@ int RGBScale(const uint8_t* src_rgb,
int dst_height,
enum FilterMode filtering) {
int r;
-if (!src_rgb || !dst_rgb ||
-    src_width <= 0 || src_width > INT_MAX / 4 || src_height == 0 ||
-    dst_width <= 0 || dst_width > INT_MAX / 4 || dst_height <= 0) {
+if (!src_rgb || !dst_rgb || src_width <= 0 || src_width > INT_MAX / 4 ||
+    src_height == 0 || dst_width <= 0 || dst_width > INT_MAX / 4 ||
+    dst_height <= 0) {
return -1;
}
const int abs_src_height = (src_height < 0) ? -src_height : src_height;

View File

@@ -149,7 +149,7 @@ void ScaleARGBRowDown2Linear_RVV(const uint8_t* src_argb,
   const uint32_t* src = (const uint32_t*)(src_argb);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m4_t v_odd, v_even, v_dst;
     vuint32m4_t v_odd_32, v_even_32;
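The vxrm note recurs throughout this file because RVV's averaging instructions (vaaddu) take their rounding behavior from the vxrm CSR rather than from the opcode. Mode 0 (rnu, round-to-nearest-up) reproduces the (a + b + 1) >> 1 averaging that the other SIMD paths compute, such as NEON's URHADD and x86's PAVGB. A scalar sketch of that rounding for reference:

  // Round-to-nearest-up average: halves round up, so (1 + 2 + 1) >> 1 == 2.
  // This is the behavior vaaddu gives under vxrm = 0 (rnu).
  static inline uint8_t AverageRoundUp(uint8_t a, uint8_t b) {
    return (uint8_t)(((unsigned)a + b + 1) >> 1);
  }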
@@ -214,7 +214,7 @@ void ScaleARGBRowDown2Box_RVV(const uint8_t* src_argb,
   const uint32_t* src1 = (const uint32_t*)(src_argb + src_stride);
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_odd, v_row0_even, v_row1_odd, v_row1_even, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_dst_16;
@@ -311,7 +311,7 @@ void ScaleARGBRowDownEvenBox_RVV(const uint8_t* src_argb,
   const int stride_byte = src_stepx * 4;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m4_t v_row0_low, v_row0_high, v_row1_low, v_row1_high, v_dst;
     vuint16m8_t v_row0_sum, v_row1_sum, v_sum;
@@ -389,7 +389,7 @@ void ScaleRowDown2Linear_RVV(const uint8_t* src_ptr,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m4_t v_s0, v_s1, v_dst;
     size_t vl = __riscv_vsetvl_e8m4(w);
@@ -444,7 +444,7 @@ void ScaleRowDown2Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     size_t vl = __riscv_vsetvl_e8m4(w);
     vuint8m4_t v_s0, v_s1, v_t0, v_t1;
@@ -577,7 +577,7 @@ void ScaleRowDown4Box_RVV(const uint8_t* src_ptr,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_t0, v_t1, v_t2, v_t3;
@@ -747,7 +747,7 @@ void ScaleRowDown34_0_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint16m4_t v_t0_u16, v_t1_u16, v_t2_u16, v_t3_u16;
@@ -876,7 +876,7 @@ void ScaleRowDown34_1_Box_RVV(const uint8_t* src_ptr,
   const uint8_t* t = src_ptr + src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m2_t v_s0, v_s1, v_s2, v_s3;
     vuint8m2_t v_ave0, v_ave1, v_ave2, v_ave3;
@@ -1539,7 +1539,7 @@ void ScaleUVRowDown2Linear_RVV(const uint8_t* src_uv,
   (void)src_stride;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m4_t v_u0v0, v_u1v1, v_avg;
     vuint16m4_t v_u0v0_16, v_u1v1_16;
@@ -1608,7 +1608,7 @@ void ScaleUVRowDown2Box_RVV(const uint8_t* src_uv,
   size_t w = (size_t)dst_width;
   // NOTE: To match behavior on other platforms, vxrm (fixed-point rounding mode
   // register) is set to round-to-nearest-up mode(0).
-  asm volatile ("csrwi vxrm, 0");
+  asm volatile("csrwi vxrm, 0");
   do {
     vuint8m2_t v_u0_row0, v_v0_row0, v_u1_row0, v_v1_row0;
     vuint8m2_t v_u0_row1, v_v0_row1, v_u1_row1, v_v1_row1;

View File

@@ -15,7 +15,6 @@ namespace libyuv {
 extern "C" {
 #endif

-
 #if !defined(LIBYUV_DISABLE_SME) && defined(CLANG_HAS_SME) && \
     defined(__aarch64__)

View File

@@ -333,14 +333,14 @@ static void ScaleUVDownEven(int src_width,
 #endif
 #if defined(HAS_SCALEUVROWDOWNEVEN_RVV) || defined(HAS_SCALEUVROWDOWN4_RVV)
   if (TestCpuFlag(kCpuHasRVV) && !filtering) {
 #if defined(HAS_SCALEUVROWDOWNEVEN_RVV)
     ScaleUVRowDownEven = ScaleUVRowDownEven_RVV;
 #endif
 #if defined(HAS_SCALEUVROWDOWN4_RVV)
     if (col_step == 4) {
       ScaleUVRowDownEven = ScaleUVRowDown4_RVV;
     }
 #endif
   }
 #endif
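This hunk is an instance of libyuv's standard dispatch idiom: a row-function pointer starts at the portable C implementation and is overridden at runtime when TestCpuFlag reports the required extension, with compile-time #if guards around each candidate so unbuilt variants never appear. Stripped to its shape (function names hypothetical, not the actual symbols above):

  // Hypothetical skeleton of the runtime dispatch pattern used above.
  void (*ScaleRow)(const uint8_t* src, uint8_t* dst, int width) = ScaleRow_C;
  #if defined(HAS_SCALEROW_RVV)
  if (TestCpuFlag(kCpuHasRVV)) {
    ScaleRow = ScaleRow_RVV;  // taken only on CPUs with the RVV extension
  }
  #endif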

View File

@@ -12,6 +12,7 @@
 #include <stdlib.h>
 #include <time.h>

+#include "../unit_test/unit_test.h"
 #include "libyuv/basic_types.h"
 #include "libyuv/compare.h"
 #include "libyuv/convert.h"
@@ -19,7 +20,6 @@
 #include "libyuv/convert_from.h"
 #include "libyuv/convert_from_argb.h"
 #include "libyuv/cpu_id.h"
-#include "../unit_test/unit_test.h"
 #include "libyuv/planar_functions.h"
 #include "libyuv/rotate.h"
 #include "libyuv/video_common.h"

View File

@@ -67,16 +67,16 @@ TEST_F(LibYUVBaseTest, TestCpuId) {
 #endif

 #ifdef __linux__
-static void KernelVersion(int *version) {
+static void KernelVersion(int* version) {
   struct utsname buffer;
   int i = 0;

   version[0] = version[1] = 0;
   if (uname(&buffer) == 0) {
-    char *v = buffer.release;
+    char* v = buffer.release;
     for (i = 0; *v && i < 2; ++v) {
       if (isdigit(*v)) {
-        version[i++] = (int) strtol(v, &v, 10);
+        version[i++] = (int)strtol(v, &v, 10);
       }
     }
   }
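KernelVersion walks uname()'s release string and lets strtol pick out the first two numeric runs, so version ends up holding {major, minor}. A usage sketch (the threshold check is illustrative, not taken from the test):

  int version[2];
  KernelVersion(version);
  // version[0] == 6, version[1] == 1 for a release string like "6.1.21-v8+".
  if (version[0] > 3 || (version[0] == 3 && version[1] >= 4)) {
    // Kernel is at least 3.4.
  }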
@@ -142,8 +142,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
   // Read and print the RVV vector length.
   if (has_rvv) {
-    register uint32_t vlenb __asm__ ("t0");
-    __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r" (vlenb));
+    register uint32_t vlenb __asm__("t0");
+    __asm__(".word 0xC22022F3" /* CSRR t0, vlenb */ : "=r"(vlenb));
     printf("RVV vector length: %d bytes\n", vlenb);
   }
 }
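The hard-coded `.word 0xC22022F3` is the instruction encoding of `csrr t0, vlenb` (CSR 0xC22 read into x5), used so the test assembles even when the toolchain does not accept V-extension mnemonics. Where the RVV intrinsics are usable in the build, the same byte count can be read portably; a sketch, assuming riscv_vector.h is available:

  #include <riscv_vector.h>

  // VLMAX for e8/m1 is VLEN / 8, the vector register size in bytes,
  // the same quantity the vlenb CSR reports.
  size_t vlen_bytes = __riscv_vsetvlmax_e8m1();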
@@ -169,8 +169,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
   }
 #endif  // defined(__loongarch__)

-#if defined(__i386__) || defined(__x86_64__) || \
-    defined(_M_IX86) || defined(_M_X64)
+#if defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || \
+    defined(_M_X64)
   int has_x86 = TestCpuFlag(kCpuHasX86);
   if (has_x86) {
     int has_sse2 = TestCpuFlag(kCpuHasSSE2);
@@ -215,7 +215,8 @@ TEST_F(LibYUVBaseTest, TestCpuHas) {
     printf("Has AVXVNNIINT8 0x%x\n", has_avxvnniint8);
     printf("Has AMXINT8 0x%x\n", has_amxint8);
   }
-#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)
+#endif  // defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) ||
+        // defined(_M_X64)
 }

 TEST_F(LibYUVBaseTest, TestCompilerMacros) {

View File

@@ -1570,18 +1570,21 @@ static int TestCopyPlane(int benchmark_width,
   // Disable all optimizations.
   MaskCpuFlags(disable_cpu_flags);
   for (int i = 0; i < benchmark_iterations; i++) {
-    CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width, benchmark_width, benchmark_height * invert);
+    CopyPlane(orig_y + off, benchmark_width, dst_c, benchmark_width,
+              benchmark_width, benchmark_height * invert);
   }

   // Enable optimizations.
   MaskCpuFlags(benchmark_cpu_info);
   for (int i = 0; i < benchmark_iterations; i++) {
-    CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width, benchmark_width, benchmark_height * invert);
+    CopyPlane(orig_y + off, benchmark_width, dst_opt, benchmark_width,
+              benchmark_width, benchmark_height * invert);
   }

   int max_diff = 0;
   for (int i = 0; i < y_plane_size; ++i) {
-    int abs_diff = abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
+    int abs_diff =
+        abs(static_cast<int>(dst_c[i]) - static_cast<int>(dst_opt[i]));
     if (abs_diff > max_diff) {
       max_diff = abs_diff;
     }
@@ -2499,17 +2502,19 @@ static int TestHalfFloatPlane(int benchmark_width,
   // Disable all optimizations.
   MaskCpuFlags(disable_cpu_flags);
   for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
-                   reinterpret_cast<uint16_t*>(dst_c), benchmark_width * 2,
-                   scale, benchmark_width, benchmark_height * invert);
+    HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
+                   benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_c),
+                   benchmark_width * 2, scale, benchmark_width,
+                   benchmark_height * invert);
   }

   // Enable optimizations.
   MaskCpuFlags(benchmark_cpu_info);
   for (j = 0; j < benchmark_iterations; j++) {
-    HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off), benchmark_width * 2,
-                   reinterpret_cast<uint16_t*>(dst_opt), benchmark_width * 2,
-                   scale, benchmark_width, benchmark_height * invert);
+    HalfFloatPlane(reinterpret_cast<uint16_t*>(orig_y + off),
+                   benchmark_width * 2, reinterpret_cast<uint16_t*>(dst_opt),
+                   benchmark_width * 2, scale, benchmark_width,
+                   benchmark_height * invert);
   }

   int max_diff = 0;
@@ -2536,23 +2541,23 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_One) {
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_Opt) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_12bit_Opt) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4095.0f, 4095, +1, 0);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_Opt) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
   EXPECT_EQ(0, diff);
 }
@@ -2564,59 +2569,57 @@ TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_9bit_Opt) {
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Any) {
-  int diff = TestHalfFloatPlane(benchmark_width_ + 1, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_ + 1, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Unaligned) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 2);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Invert) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, -1, 0);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_Opt) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 4096.0f, 4095, +1, 0);
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_denormal) {
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
   EXPECT_EQ(0, diff);
 }
 #if defined(__arm__)
 static void EnableFlushDenormalToZero(void) {
   uint32_t cw;
-  asm volatile (
+  asm volatile(
       "vmrs %0, fpscr \n"
       "orr %0, %0, #0x1000000 \n"
       "vmsr fpscr, %0 \n"
-      : "=r"(cw)
-      ::"memory", "cc");  // Clobber List
+      : "=r"(cw)::"memory", "cc");  // Clobber List
 }

 static void DisableFlushDenormalToZero(void) {
   uint32_t cw;
-  asm volatile (
+  asm volatile(
       "vmrs %0, fpscr \n"
       "bic %0, %0, #0x1000000 \n"
       "vmsr fpscr, %0 \n"
-      : "=r"(cw)
-      ::"memory", "cc");  // Clobber List
+      : "=r"(cw)::"memory", "cc");  // Clobber List
 }

 // 5 bit exponent with bias of 15 will underflow to a denormal if scale causes
@@ -2626,18 +2629,18 @@ static void DisableFlushDenormalToZero(void) {
 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_16bit_flush_denormal) {
   // 32 bit arm rounding on denormal case is off by 1 compared to C.
   EnableFlushDenormalToZero();
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 65535.0f, 65535, +1, 0);
   DisableFlushDenormalToZero();
   EXPECT_EQ(0, diff);
 }

 TEST_F(LibYUVPlanarTest, TestHalfFloatPlane_10bit_flush_denormal) {
   EnableFlushDenormalToZero();
-  int diff = TestHalfFloatPlane(benchmark_width_, benchmark_height_,
-                                benchmark_iterations_, disable_cpu_flags_,
-                                benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
+  int diff = TestHalfFloatPlane(
+      benchmark_width_, benchmark_height_, benchmark_iterations_,
+      disable_cpu_flags_, benchmark_cpu_info_, 1.0f / 1023.0f, 1023, +1, 0);
   DisableFlushDenormalToZero();
   EXPECT_EQ(0, diff);
 }
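The FZ toggle matters here because of where the half-precision denormal boundary falls. With 16-bit inputs scaled by 1.0f / 65535.0f, working through the endpoints:

  2^-14     ≈ 6.1035e-5   smallest normal half float (5-bit exponent, bias 15)
  3 / 65535 ≈ 4.5777e-5   denormal: flushed to zero when FZ is set
  4 / 65535 ≈ 6.1036e-5   just above the boundary, stays normal

So only the lowest few input codes land in the denormal range, which is exactly where 32-bit ARM hardware rounding is off by one relative to the C reference, per the comment above.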
@@ -3184,8 +3187,9 @@ TEST_F(LibYUVPlanarTest, MergeRGBPlane_Opt) {
                 tmp_pixels_c_b, benchmark_width_, benchmark_width_,
                 benchmark_height_);
   MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
-                benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
-                benchmark_width_ * 3, benchmark_width_, benchmark_height_);
+                benchmark_width_, tmp_pixels_c_b, benchmark_width_,
+                dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
+                benchmark_height_);

   MaskCpuFlags(benchmark_cpu_info_);
   SplitRGBPlane(src_pixels, benchmark_width_ * 3, tmp_pixels_opt_r,
@@ -3244,8 +3248,9 @@ TEST_F(LibYUVPlanarTest, SplitRGBPlane_Opt) {
                 tmp_pixels_c_b, benchmark_width_, benchmark_width_,
                 benchmark_height_);
   MergeRGBPlane(tmp_pixels_c_r, benchmark_width_, tmp_pixels_c_g,
-                benchmark_width_, tmp_pixels_c_b, benchmark_width_, dst_pixels_c,
-                benchmark_width_ * 3, benchmark_width_, benchmark_height_);
+                benchmark_width_, tmp_pixels_c_b, benchmark_width_,
+                dst_pixels_c, benchmark_width_ * 3, benchmark_width_,
+                benchmark_height_);

   MaskCpuFlags(benchmark_cpu_info_);
   for (int i = 0; i < benchmark_iterations_; ++i) {
@@ -3446,8 +3451,8 @@ TEST_F(LibYUVPlanarTest, MergeXRGBPlane_Opt) {
   for (int i = 0; i < benchmark_iterations_; ++i) {
     MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,
-                   benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL, 0,
-                   dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
+                   benchmark_width_, tmp_pixels_opt_b, benchmark_width_, NULL,
+                   0, dst_pixels_opt, benchmark_width_ * 4, benchmark_width_,
                    benchmark_height_);
   }
@@ -3502,8 +3507,8 @@ TEST_F(LibYUVPlanarTest, SplitXRGBPlane_Opt) {
   for (int i = 0; i < benchmark_iterations_; ++i) {
     SplitARGBPlane(src_pixels, benchmark_width_ * 4, tmp_pixels_opt_r,
                    benchmark_width_, tmp_pixels_opt_g, benchmark_width_,
-                   tmp_pixels_opt_b, benchmark_width_, NULL, 0, benchmark_width_,
-                   benchmark_height_);
+                   tmp_pixels_opt_b, benchmark_width_, NULL, 0,
+                   benchmark_width_, benchmark_height_);
   }

   MergeARGBPlane(tmp_pixels_opt_r, benchmark_width_, tmp_pixels_opt_g,