Mirror of https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 09:16:48 +08:00
Fixes for SplitUVPlane_16 and MergeUVPlane_16
Planar functions pass depth instead of a scale factor. Row functions pass
shift instead of depth. Add asserts to C. The AVX shift instruction expects
a single shift value in XMM. NEON passes shift as an input (not an output).
Split NEON reimplemented as a left shift on shorts by a negative amount to
achieve a right shift. Add planar unit tests.

Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
This commit is contained in:
parent d8f1bfc981
commit 312c02a5aa
README.chromium
@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1782
+Version: 1783
 License: BSD
 License File: LICENSE
 
include/libyuv/version.h
@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1782
+#define LIBYUV_VERSION 1783
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_
source/convert.cc
@@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y,
 }
 
 // Any I[420]1[02] to P[420]1[02] format with mirroring.
-static int Ix1xToPx1x(const uint16_t* src_y,
+static int IxxxToPxxx(const uint16_t* src_y,
                       int src_stride_y,
                       const uint16_t* src_u,
                       int src_stride_u,
@@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 1, 10);
 }
@@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 0, 10);
 }
@@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 1, 12);
 }
@@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y,
                int dst_stride_uv,
                int width,
                int height) {
-  return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
+  return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
                     src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
                     width, height, 1, 0, 12);
 }
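A note on the call sites (my reading of the diff, not text from the commit): the three trailing arguments select the chroma geometry and the bit depth, which lines up with the four converters above. Field names here are hypothetical:

    /* Mapping implied by the IxxxToPxxx call sites (names are mine). */
    struct Px1xLayout {
      int subsample_x;  /* chroma width shift: 1 = half width   */
      int subsample_y;  /* chroma height shift: 1 = half height */
      int depth;        /* significant bits per sample          */
    };
    static const struct Px1xLayout kI010ToP010 = {1, 1, 10};  /* 4:2:0 10-bit */
    static const struct Px1xLayout kI210ToP210 = {1, 0, 10};  /* 4:2:2 10-bit */
    static const struct Px1xLayout kI012ToP012 = {1, 1, 12};  /* 4:2:0 12-bit */
    static const struct Px1xLayout kI212ToP212 = {1, 0, 12};  /* 4:2:2 12-bit */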
source/planar_functions.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/planar_functions.h"
 
+#include <assert.h>
 #include <string.h>  // for memset()
 
 #include "libyuv/cpu_id.h"
@@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv,
                      int height,
                      int depth) {
   int y;
-  int scale = 1 << depth;
-  void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
-                     int scale, int width) = SplitUVRow_16_C;
+  void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
+                        uint16_t* dst_v, int depth, int width) =
+      SplitUVRow_16_C;
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv,
   }
 #if defined(HAS_SPLITUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    SplitUVRow = SplitUVRow_16_Any_AVX2;
+    SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      SplitUVRow = SplitUVRow_16_AVX2;
+      SplitUVRow_16 = SplitUVRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_SPLITUVROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    SplitUVRow = SplitUVRow_16_Any_NEON;
+    SplitUVRow_16 = SplitUVRow_16_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      SplitUVRow = SplitUVRow_16_NEON;
+      SplitUVRow_16 = SplitUVRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
     // Copy a row of UV.
-    SplitUVRow(src_uv, dst_u, dst_v, scale, width);
+    SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
     dst_u += dst_stride_u;
     dst_v += dst_stride_v;
     src_uv += src_stride_uv;
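The wrapper above is libyuv's standard dispatch shape: start from the C row function, upgrade to the SIMD _Any_ variant when the CPU flag is set, upgrade again to the aligned variant when width is a multiple of the vector width, then iterate rows. A stripped-down sketch of that idiom (names and signature are illustrative, not the library's API):

    #include <stdint.h>

    typedef void (*SplitRowFn)(const uint16_t* src_uv, uint16_t* dst_u,
                               uint16_t* dst_v, int depth, int width);

    /* Pick the fastest usable row function, then walk the plane. */
    static void split_plane(const uint16_t* src_uv, int src_stride,
                            uint16_t* dst_u, int u_stride,
                            uint16_t* dst_v, int v_stride,
                            int width, int height, int depth,
                            SplitRowFn c_row, SplitRowFn simd_any,
                            SplitRowFn simd_aligned, int lanes, int has_simd) {
      SplitRowFn row = c_row;          /* always-correct fallback */
      if (has_simd) {
        row = simd_any;                /* handles ragged widths */
        if (width % lanes == 0) {
          row = simd_aligned;          /* full vector path */
        }
      }
      for (int y = 0; y < height; ++y) {
        row(src_uv, dst_u, dst_v, depth, width);
        src_uv += src_stride;
        dst_u += u_stride;
        dst_v += v_stride;
      }
    }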
source/planar_functions.cc (continued)
@@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
                      int height,
                      int depth) {
   int y;
-  int scale = 1 << (16 - depth);
-  void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v,
-                     uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C;
+  void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
+                        uint16_t* dst_uv, int depth, int width) =
+      MergeUVRow_16_C;
+  assert(depth >= 8);
+  assert(depth <= 16);
   // Negative height means invert the image.
   if (height < 0) {
     height = -height;
@@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u,
   }
 #if defined(HAS_MERGEUVROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    MergeUVRow = MergeUVRow_16_Any_AVX2;
+    MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 16)) {
-      MergeUVRow = MergeUVRow_16_AVX2;
+      MergeUVRow_16 = MergeUVRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_MERGEUVROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MergeUVRow = MergeUVRow_16_Any_NEON;
+    MergeUVRow_16 = MergeUVRow_16_Any_NEON;
     if (IS_ALIGNED(width, 8)) {
-      MergeUVRow = MergeUVRow_16_NEON;
+      MergeUVRow_16 = MergeUVRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
     // Merge a row of U and V into a row of UV.
-    MergeUVRow(src_u, src_v, dst_uv, scale, width);
+    MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
     src_u += src_stride_u;
     src_v += src_stride_v;
     dst_uv += dst_stride_uv;
@@ -671,7 +674,7 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
                           int depth) {
   int y;
   int scale = 1 << (16 - depth);
-  void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
+  void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
                      int width) = MultiplyRow_16_C;
   // Negative height means invert the image.
   if (height < 0) {
@@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
 
 #if defined(HAS_MULTIPLYROW_16_AVX2)
   if (TestCpuFlag(kCpuHasAVX2)) {
-    MultiplyRow = MultiplyRow_16_Any_AVX2;
+    MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
     if (IS_ALIGNED(width, 32)) {
-      MultiplyRow = MultiplyRow_16_AVX2;
+      MultiplyRow_16 = MultiplyRow_16_AVX2;
     }
   }
 #endif
 #if defined(HAS_MULTIPLYROW_16_NEON)
   if (TestCpuFlag(kCpuHasNEON)) {
-    MultiplyRow = MultiplyRow_16_Any_NEON;
+    MultiplyRow_16 = MultiplyRow_16_Any_NEON;
     if (IS_ALIGNED(width, 16)) {
-      MultiplyRow = MultiplyRow_16_NEON;
+      MultiplyRow_16 = MultiplyRow_16_NEON;
     }
   }
 #endif
 
   for (y = 0; y < height; ++y) {
-    MultiplyRow(src_y, dst_y, scale, width);
+    MultiplyRow_16(src_y, dst_y, scale, width);
     src_y += src_stride_y;
     dst_y += dst_stride_y;
   }
source/row_common.cc
@@ -10,6 +10,7 @@
 
 #include "libyuv/row.h"
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>  // For memcpy and memset.
 
@@ -3045,6 +3046,8 @@ void MergeUVRow_16_C(const uint16_t* src_u,
                      int depth,
                      int width) {
   int shift = 16 - depth;
+  assert(depth >= 8);
+  assert(depth <= 16);
   int x;
   for (x = 0; x < width; ++x) {
     dst_uv[0] = src_u[x] << shift;
@@ -3061,6 +3064,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv,
                      int width) {
   int shift = 16 - depth;
   int x;
+  assert(depth >= 8);
+  assert(depth <= 16);
   for (x = 0; x < width; ++x) {
     dst_u[x] = src_uv[0] >> shift;
     dst_v[x] = src_uv[1] >> shift;
@@ -3098,6 +3103,9 @@ void Convert16To8Row_C(const uint16_t* src_y,
                        int scale,
                        int width) {
   int x;
+  assert(scale >= 256);
+  assert(scale <= 32768);
+
   for (x = 0; x < width; ++x) {
     dst_y[x] = clamp255((src_y[x] * scale) >> 16);
   }
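The new scale asserts in Convert16To8Row_C bound the supported source depths at 16 and 9 bits. A quick worked check of the fixed-point mapping (numbers mine, derived from the code above):

    /* (src * scale) >> 16 maps a depth-bit value onto 8 bits when
       scale = 1 << (24 - depth):
         depth 16: scale 256   -> (65535 * 256)   >> 16 = 255
         depth 10: scale 16384 -> (1023 * 16384)  >> 16 = 255
         depth  9: scale 32768 -> (511 * 32768)   >> 16 = 255
       hence assert(scale >= 256) and assert(scale <= 32768). */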
source/row_gcc.cc
@@ -4728,8 +4728,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
   // clang-format off
   asm volatile (
     "vmovd      %4,%%xmm3                      \n"
-    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
-    "vbroadcastss %%xmm3,%%xmm3                \n"
     "sub        %0,%1                          \n"
 
   // 16 pixels per loop.
@@ -4761,7 +4759,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
 }
 #endif  // HAS_MERGEUVROW_AVX2
 
-#ifdef HAS_MERGEUVROW_16_AVX2
+#ifdef HAS_SPLITUVROW_16_AVX2
 const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
                                  2, 3, 6, 7, 10, 11, 14, 15};
 void SplitUVRow_16_AVX2(const uint16_t* src_uv,
@@ -4773,8 +4771,6 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
   // clang-format off
   asm volatile (
     "vmovd      %4,%%xmm3                      \n"
-    "vpunpcklwd %%xmm3,%%xmm3,%%xmm3           \n"
-    "vbroadcastss %%xmm3,%%xmm3                \n"
     "vbroadcastf128 %5,%%ymm4                  \n"
     "sub        %1,%2                          \n"
 
@@ -4802,14 +4798,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
         "+r"(dst_v),   // %2
-        "+r"(width),   // %3
-        "+r"(depth)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(depth),    // %4
         "m"(kSplitUVShuffle16)  // %5
       : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
   // clang-format on
 }
-#endif  // HAS_MERGEUVROW_AVX2
+#endif  // HAS_SPLITUVROW_16_AVX2
 
 // Use scale to convert lsb formats to msb, depending how many bits there are:
 // 128 = 9 bits
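The deleted vpunpcklwd/vbroadcastss pairs are the commit message's "AVX shift instruction expects a single shift value in XMM": vpsllw/vpsrlw take one count from the low 64 bits of an XMM register, so broadcasting the count across lanes was wasted work. The same idea with intrinsics, assuming AVX2 (a sketch, not the commit's asm):

    #include <immintrin.h>

    /* vpsrlw: every uint16 lane is shifted right by the single count
       held in the low quadword of an XMM register. */
    static __m256i shift_right_u16(__m256i v, int shift) {
      __m128i count = _mm_cvtsi32_si128(shift);  /* count in low 32 bits */
      return _mm256_srl_epi16(v, count);
    }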
source/row_neon.cc
@@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
                         uint16_t* dst_v,
                         int depth,
                         int width) {
+  int shift = depth - 16;  // Negative for right shift.
   asm volatile(
-      "vdup.32    q0, %3                         \n"
+      "vdup.16    q2, %4                         \n"
       "1:                                        \n"
-      "vld2.16    {q1, q2}, [%0]!                \n"  // load 8 UV
-      "vmovl.u16  q3, d2                         \n"
-      "vmovl.u16  q4, d3                         \n"
-      "vshl.u32   q3, q3, q0                     \n"
-      "vshl.u32   q4, q4, q0                     \n"
-      "vmovn.u32  d2, q3                         \n"
-      "vmovn.u32  d3, q4                         \n"
-      "vmovl.u16  q3, d4                         \n"
-      "vmovl.u16  q4, d5                         \n"
-      "vshl.u32   q3, q3, q0                     \n"
-      "vshl.u32   q4, q4, q0                     \n"
-      "vmovn.u32  d4, q3                         \n"
-      "vmovn.u32  d5, q4                         \n"
-      "subs       %4, %4, #8                     \n"  // 8 src pixels per loop
-      "vst1.16    {q1}, [%1]!                    \n"  // store 8 U pixels
-      "vst1.16    {q2}, [%2]!                    \n"  // store 8 V pixels
+      "vld2.16    {q0, q1}, [%0]!                \n"  // load 8 UV
+      "vshl.u16   q0, q0, q2                     \n"
+      "vshl.u16   q1, q1, q2                     \n"
+      "subs       %3, %3, #8                     \n"  // 8 src pixels per loop
+      "vst1.16    {q0}, [%1]!                    \n"  // store 8 U pixels
+      "vst1.16    {q1}, [%2]!                    \n"  // store 8 V pixels
       "bgt        1b                             \n"
       : "+r"(src_uv),  // %0
         "+r"(dst_u),   // %1
        "+r"(dst_v),   // %2
-        "+r"(depth),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2", "q3", "q4");
 }
 
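32-bit NEON has no variable right-shift instruction: vshl shifts each lane left by a signed per-lane count, and a negative count shifts right. That is why the rewrite computes shift = depth - 16 (negative) and collapses the old widen-to-32-bit/shift/narrow sequence into one vshl.u16 per register. The same trick in intrinsics (illustrative sketch, not the commit's code):

    #include <arm_neon.h>

    /* Right-shift eight uint16 lanes via vshl with a negative count:
       for MSB-justified depth-bit samples, depth - 16 < 0, so this
       shifts right by 16 - depth. */
    static uint16x8_t shift_to_lsb(uint16x8_t v, int depth) {
      int16x8_t count = vdupq_n_s16((int16_t)(depth - 16));
      return vshlq_u16(v, count);
    }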
source/row_neon.cc (continued)
@@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
                         int width) {
   int shift = 16 - depth;
   asm volatile(
-      "vdup.16    q2, %3                         \n"
+      "vdup.16    q2, %4                         \n"
       "1:                                        \n"
       "vld1.16    {q0}, [%0]!                    \n"  // load 8 U
       "vld1.16    {q1}, [%1]!                    \n"  // load 8 V
       "vshl.u16   q0, q0, q2                     \n"
       "vshl.u16   q1, q1, q2                     \n"
-      "subs       %4, %4, #8                     \n"  // 8 src pixels per loop
+      "subs       %3, %3, #8                     \n"  // 8 src pixels per loop
       "vst2.16    {q0, q1}, [%2]!                \n"  // store 8 UV pixels
       "bgt        1b                             \n"
       : "+r"(src_u),   // %0
         "+r"(src_v),   // %1
         "+r"(dst_uv),  // %2
-        "+r"(shift),   // %3
-        "+r"(width)    // %4
-      :
+        "+r"(width)    // %3
+      : "r"(shift)     // %4
       : "cc", "memory", "q0", "q1", "q2");
 }
 
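The operand change in both NEON routines is the commit message's "NEON passes shift as an input (not an output)": shift was previously declared "+r" (read-write output) even though the asm only reads it, which also told the compiler the register could be clobbered. Moving it to the input list states the contract precisely. A minimal sketch of the distinction, assuming 32-bit ARM GCC inline asm (not libyuv code):

    /* r is modified in place -> "+r" output; n is only read -> "r" input. */
    static int shl_var(int r, int n) {
      asm volatile("lsl %0, %0, %1" : "+r"(r) : "r"(n));
      return r;
    }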
(Diff for one additional file suppressed because it is too large.)
unit_test/planar_test.cc
@@ -2605,6 +2605,64 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// 16 bit channel split and merge
+TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
+  // Round count up to multiple of 16
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+  align_buffer_page_end(tmp_pixels_u_c, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_v_c, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2);
+  align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
+  MemRandomize(src_pixels, kPixels * 2 * 2);
+  MemRandomize(tmp_pixels_u_c, kPixels * 2);
+  MemRandomize(tmp_pixels_v_c, kPixels * 2);
+  MemRandomize(tmp_pixels_u_opt, kPixels * 2);
+  MemRandomize(tmp_pixels_v_opt, kPixels * 2);
+  MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
+  MemRandomize(dst_pixels_c, kPixels * 2 * 2);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)tmp_pixels_u_c, benchmark_width_,
+                  (uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_,
+                  benchmark_height_, 12);
+  MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_,
+                  (const uint16_t*)tmp_pixels_v_c, benchmark_width_,
+                  (uint16_t*)dst_pixels_c, benchmark_width_ * 2,
+                  benchmark_width_, benchmark_height_, 12);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)tmp_pixels_u_opt, benchmark_width_,
+                  (uint16_t*)tmp_pixels_v_opt, benchmark_width_,
+                  benchmark_width_, benchmark_height_, 12);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_,
+                    (const uint16_t*)tmp_pixels_v_opt, benchmark_width_,
+                    (uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
+                    benchmark_width_, benchmark_height_, 12);
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]);
+    EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]);
+  }
+  for (int i = 0; i < kPixels * 2 * 2; ++i) {
+    EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(tmp_pixels_u_c);
+  free_aligned_buffer_page_end(tmp_pixels_v_c);
+  free_aligned_buffer_page_end(tmp_pixels_u_opt);
+  free_aligned_buffer_page_end(tmp_pixels_v_opt);
+  free_aligned_buffer_page_end(dst_pixels_opt);
+  free_aligned_buffer_page_end(dst_pixels_c);
+}
+
 TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   // Round count up to multiple of 16
   const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
@@ -2649,6 +2707,46 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
   free_aligned_buffer_page_end(dst_pixels_c);
 }
 
+// 16 bit channel split
+TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
+  // Round count up to multiple of 16
+  const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
+  align_buffer_page_end(src_pixels, kPixels * 2 * 2);
+  align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
+  align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
+  align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
+  align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
+  MemRandomize(src_pixels, kPixels * 2 * 2);
+  MemRandomize(dst_pixels_u_c, kPixels * 2);
+  MemRandomize(dst_pixels_v_c, kPixels * 2);
+  MemRandomize(dst_pixels_u_opt, kPixels * 2);
+  MemRandomize(dst_pixels_v_opt, kPixels * 2);
+
+  MaskCpuFlags(disable_cpu_flags_);
+  SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                  (uint16_t*)dst_pixels_u_c, benchmark_width_,
+                  (uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
+                  benchmark_height_, 10);
+  MaskCpuFlags(benchmark_cpu_info_);
+
+  for (int i = 0; i < benchmark_iterations_; ++i) {
+    SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
+                    (uint16_t*)dst_pixels_u_opt, benchmark_width_,
+                    (uint16_t*)dst_pixels_v_opt, benchmark_width_,
+                    benchmark_width_, benchmark_height_, 10);
+  }
+
+  for (int i = 0; i < kPixels * 2; ++i) {
+    EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
+    EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
+  }
+  free_aligned_buffer_page_end(src_pixels);
+  free_aligned_buffer_page_end(dst_pixels_u_c);
+  free_aligned_buffer_page_end(dst_pixels_v_c);
+  free_aligned_buffer_page_end(dst_pixels_u_opt);
+  free_aligned_buffer_page_end(dst_pixels_v_opt);
+}
+
 TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
   // Round count up to multiple of 16
   const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
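The added tests exercise a split/merge round trip at depths 12 and 10 and require the C and SIMD paths to agree bit-for-bit. For callers, the round trip looks roughly like this (hedged sketch; contiguous planes, strides in uint16 elements, matching how the tests invoke the two functions):

    #include <stdint.h>
    #include "libyuv/planar_functions.h"

    /* Split a 12-bit interleaved UV plane, then merge it back. */
    void roundtrip_uv_16(const uint16_t* uv, uint16_t* u, uint16_t* v,
                         uint16_t* uv_out, int width, int height) {
      SplitUVPlane_16(uv, width * 2, u, width, v, width, width, height, 12);
      MergeUVPlane_16(u, width, v, width, uv_out, width * 2, width, height, 12);
    }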