Fixes for SplitUVPlane_16 and MergeUVPlane_16

Planar functions pass depth instead of a scale factor.
Row functions pass a shift to the assembly instead of a depth; asserts added to the C versions (a minimal sketch of the convention follows below).
The AVX shift instructions expect a single shift count in an XMM register, not a per-lane broadcast.
NEON passes the shift as an input operand (not an output).
The NEON split row is reimplemented as a left shift on 16-bit shorts by a negative amount to achieve a right shift.
Add planar unit tests.
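
A minimal C sketch of the corrected convention (illustrative only, not the
library code; the _ref names are hypothetical). The planar functions pass a
bit depth, and the reference rows derive the shift from it:

#include <assert.h>
#include <stdint.h>

// Split converts MSB-justified interleaved UV (e.g. P010) into LSB-justified
// planes (e.g. I010) by shifting right; Merge is the inverse left shift.
static void SplitUVRow_16_ref(const uint16_t* src_uv, uint16_t* dst_u,
                              uint16_t* dst_v, int depth, int width) {
  int shift = 16 - depth;  // depth 10 -> shift 6
  assert(depth >= 8 && depth <= 16);
  for (int x = 0; x < width; ++x) {
    dst_u[x] = src_uv[2 * x + 0] >> shift;
    dst_v[x] = src_uv[2 * x + 1] >> shift;
  }
}

static void MergeUVRow_16_ref(const uint16_t* src_u, const uint16_t* src_v,
                              uint16_t* dst_uv, int depth, int width) {
  int shift = 16 - depth;
  assert(depth >= 8 && depth <= 16);
  for (int x = 0; x < width; ++x) {
    dst_uv[2 * x + 0] = (uint16_t)(src_u[x] << shift);
    dst_uv[2 * x + 1] = (uint16_t)(src_v[x] << shift);
  }
}

Depths 10 and 12 (shifts 6 and 4) are the cases exercised by the new tests.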

Bug: libyuv:888
Change-Id: I8fe62d3d777effc5321c361cd595c58b7f93807e
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2782086
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Frank Barchard 2021-03-24 13:45:04 -07:00 committed by Frank Barchard
parent d8f1bfc981
commit 312c02a5aa
10 changed files with 311 additions and 230 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1782
Version: 1783
License: BSD
License File: LICENSE

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1782
#define LIBYUV_VERSION 1783
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -400,7 +400,7 @@ int I210ToI010(const uint16_t* src_y,
}
// Any I[420]1[02] to P[420]1[02] format with mirroring.
static int Ix1xToPx1x(const uint16_t* src_y,
static int IxxxToPxxx(const uint16_t* src_y,
int src_stride_y,
const uint16_t* src_u,
int src_stride_u,
@ -441,7 +441,7 @@ int I010ToP010(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 10);
}
@ -459,7 +459,7 @@ int I210ToP210(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 10);
}
@ -477,7 +477,7 @@ int I012ToP012(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 1, 12);
}
@ -495,7 +495,7 @@ int I212ToP212(const uint16_t* src_y,
int dst_stride_uv,
int width,
int height) {
return Ix1xToPx1x(src_y, src_stride_y, src_u, src_stride_u, src_v,
return IxxxToPxxx(src_y, src_stride_y, src_u, src_stride_u, src_v,
src_stride_v, dst_y, dst_stride_y, dst_uv, dst_stride_uv,
width, height, 1, 0, 12);
}

View File

@ -10,6 +10,7 @@
#include "libyuv/planar_functions.h"
#include <assert.h>
#include <string.h> // for memset()
#include "libyuv/cpu_id.h"
@ -563,9 +564,9 @@ void SplitUVPlane_16(const uint16_t* src_uv,
int height,
int depth) {
int y;
int scale = 1 << depth;
void (*SplitUVRow)(const uint16_t* src_uv, uint16_t* dst_u, uint16_t* dst_v,
int scale, int width) = SplitUVRow_16_C;
void (*SplitUVRow_16)(const uint16_t* src_uv, uint16_t* dst_u,
uint16_t* dst_v, int depth, int width) =
SplitUVRow_16_C;
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -583,24 +584,24 @@ void SplitUVPlane_16(const uint16_t* src_uv,
}
#if defined(HAS_SPLITUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
SplitUVRow = SplitUVRow_16_Any_AVX2;
SplitUVRow_16 = SplitUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
SplitUVRow = SplitUVRow_16_AVX2;
SplitUVRow_16 = SplitUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_SPLITUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
SplitUVRow = SplitUVRow_16_Any_NEON;
SplitUVRow_16 = SplitUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
SplitUVRow = SplitUVRow_16_NEON;
SplitUVRow_16 = SplitUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Copy a row of UV.
SplitUVRow(src_uv, dst_u, dst_v, scale, width);
SplitUVRow_16(src_uv, dst_u, dst_v, depth, width);
dst_u += dst_stride_u;
dst_v += dst_stride_v;
src_uv += src_stride_uv;
@ -618,9 +619,11 @@ void MergeUVPlane_16(const uint16_t* src_u,
int height,
int depth) {
int y;
int scale = 1 << (16 - depth);
void (*MergeUVRow)(const uint16_t* src_u, const uint16_t* src_v,
uint16_t* dst_uv, int scale, int width) = MergeUVRow_16_C;
void (*MergeUVRow_16)(const uint16_t* src_u, const uint16_t* src_v,
uint16_t* dst_uv, int depth, int width) =
MergeUVRow_16_C;
assert(depth >= 8);
assert(depth <= 16);
// Negative height means invert the image.
if (height < 0) {
height = -height;
@ -636,24 +639,24 @@ void MergeUVPlane_16(const uint16_t* src_u,
}
#if defined(HAS_MERGEUVROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MergeUVRow = MergeUVRow_16_Any_AVX2;
MergeUVRow_16 = MergeUVRow_16_Any_AVX2;
if (IS_ALIGNED(width, 16)) {
MergeUVRow = MergeUVRow_16_AVX2;
MergeUVRow_16 = MergeUVRow_16_AVX2;
}
}
#endif
#if defined(HAS_MERGEUVROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MergeUVRow = MergeUVRow_16_Any_NEON;
MergeUVRow_16 = MergeUVRow_16_Any_NEON;
if (IS_ALIGNED(width, 8)) {
MergeUVRow = MergeUVRow_16_NEON;
MergeUVRow_16 = MergeUVRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
// Merge a row of U and V into a row of UV.
MergeUVRow(src_u, src_v, dst_uv, scale, width);
MergeUVRow_16(src_u, src_v, dst_uv, depth, width);
src_u += src_stride_u;
src_v += src_stride_v;
dst_uv += dst_stride_uv;
@ -671,7 +674,7 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
int depth) {
int y;
int scale = 1 << (16 - depth);
void (*MultiplyRow)(const uint16_t* src_y, uint16_t* dst_y, int scale,
void (*MultiplyRow_16)(const uint16_t* src_y, uint16_t* dst_y, int scale,
int width) = MultiplyRow_16_C;
// Negative height means invert the image.
if (height < 0) {
@ -688,23 +691,23 @@ void ConvertToMSBPlane_16(const uint16_t* src_y,
#if defined(HAS_MULTIPLYROW_16_AVX2)
if (TestCpuFlag(kCpuHasAVX2)) {
MultiplyRow = MultiplyRow_16_Any_AVX2;
MultiplyRow_16 = MultiplyRow_16_Any_AVX2;
if (IS_ALIGNED(width, 32)) {
MultiplyRow = MultiplyRow_16_AVX2;
MultiplyRow_16 = MultiplyRow_16_AVX2;
}
}
#endif
#if defined(HAS_MULTIPLYROW_16_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
MultiplyRow = MultiplyRow_16_Any_NEON;
MultiplyRow_16 = MultiplyRow_16_Any_NEON;
if (IS_ALIGNED(width, 16)) {
MultiplyRow = MultiplyRow_16_NEON;
MultiplyRow_16 = MultiplyRow_16_NEON;
}
}
#endif
for (y = 0; y < height; ++y) {
MultiplyRow(src_y, dst_y, scale, width);
MultiplyRow_16(src_y, dst_y, scale, width);
src_y += src_stride_y;
dst_y += dst_stride_y;
}
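
For contrast, ConvertToMSBPlane_16 still takes a scale of 1 << (16 - depth)
rather than a depth, since its row kernel is a plain multiply; a rough worked
example (illustrative, not library code):

  int depth = 10;
  int scale = 1 << (16 - depth);           // 64
  uint16_t lsb = 1023;                     // max 10-bit sample
  uint16_t msb = (uint16_t)(lsb * scale);  // 65472 == 1023 << 6, MSB-justified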

View File

@ -10,6 +10,7 @@
#include "libyuv/row.h"
#include <assert.h>
#include <stdio.h>
#include <string.h> // For memcpy and memset.
@ -3045,6 +3046,8 @@ void MergeUVRow_16_C(const uint16_t* src_u,
int depth,
int width) {
int shift = 16 - depth;
assert(depth >= 8);
assert(depth <= 16);
int x;
for (x = 0; x < width; ++x) {
dst_uv[0] = src_u[x] << shift;
@ -3061,6 +3064,8 @@ void SplitUVRow_16_C(const uint16_t* src_uv,
int width) {
int shift = 16 - depth;
int x;
assert(depth >= 8);
assert(depth <= 16);
for (x = 0; x < width; ++x) {
dst_u[x] = src_uv[0] >> shift;
dst_v[x] = src_uv[1] >> shift;
@ -3098,6 +3103,9 @@ void Convert16To8Row_C(const uint16_t* src_y,
int scale,
int width) {
int x;
assert(scale >= 256);
assert(scale <= 32768);
for (x = 0; x < width; ++x) {
dst_y[x] = clamp255((src_y[x] * scale) >> 16);
}
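
The new asserts bound scale to the useful range for 9- to 16-bit sources;
choosing scale near 1 << (24 - depth) maps the largest depth-bit sample onto
255 after the >> 16. Illustrative arithmetic for the endpoints (not library
code):

  depth 16: scale = 256   -> (65535 * 256)   >> 16 = 255
  depth 10: scale = 16384 -> (1023 * 16384)  >> 16 = 255
  depth  9: scale = 32768 -> (511 * 32768)   >> 16 = 255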

View File

@ -4728,8 +4728,6 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"sub %0,%1 \n"
// 16 pixels per loop.
@ -4761,7 +4759,7 @@ void MergeUVRow_16_AVX2(const uint16_t* src_u,
}
#endif // HAS_MERGEUVROW_AVX2
#ifdef HAS_MERGEUVROW_16_AVX2
#ifdef HAS_SPLITUVROW_16_AVX2
const uvec8 kSplitUVShuffle16 = {0, 1, 4, 5, 8, 9, 12, 13,
2, 3, 6, 7, 10, 11, 14, 15};
void SplitUVRow_16_AVX2(const uint16_t* src_uv,
@ -4773,8 +4771,6 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
// clang-format off
asm volatile (
"vmovd %4,%%xmm3 \n"
"vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
"vbroadcastss %%xmm3,%%xmm3 \n"
"vbroadcastf128 %5,%%ymm4 \n"
"sub %1,%2 \n"
@ -4802,14 +4798,13 @@ void SplitUVRow_16_AVX2(const uint16_t* src_uv,
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(width), // %3
"+r"(depth) // %4
:
"+r"(width) // %3
: "r"(depth), // %4
"m"(kSplitUVShuffle16) // %5
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4");
// clang-format on
}
#endif // HAS_MERGEUVROW_AVX2
#endif // HAS_SPLITUVROW_16_AVX2
// Use scale to convert lsb formats to msb, depending how many bits there are:
// 128 = 9 bits
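
Context for the removed vpunpcklwd / vbroadcastss pair: the vector shift
instructions used here (vpsllw / vpsrlw with a register operand) read one
shift count from the low 64 bits of the XMM register, so broadcasting the
count across lanes leaves extra copies in that 64-bit field and it reads back
as an oversized count. A rough intrinsics sketch of the intended pattern
(illustrative only, not the library asm; the helper name is hypothetical):

#include <immintrin.h>
#include <stdint.h>

// Shift 16 lanes of 16-bit samples right by (16 - depth) using a single
// shift count held in the low bits of an XMM register.
static inline __m256i ShiftRightBy16MinusDepth(__m256i v, int depth) {
  __m128i count = _mm_cvtsi32_si128(16 - depth);  // one count, no broadcast
  return _mm256_srl_epi16(v, count);
}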

View File

@ -3270,32 +3270,22 @@ void SplitUVRow_16_NEON(const uint16_t* src_uv,
uint16_t* dst_v,
int depth,
int width) {
int shift = depth - 16; // Negative for right shift.
asm volatile(
"vdup.32 q0, %3 \n"
"vdup.16 q2, %4 \n"
"1: \n"
"vld2.16 {q1, q2}, [%0]! \n" // load 8 UV
"vmovl.u16 q3, d2 \n"
"vmovl.u16 q4, d3 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d2, q3 \n"
"vmovn.u32 d3, q4 \n"
"vmovl.u16 q3, d4 \n"
"vmovl.u16 q4, d5 \n"
"vshl.u32 q3, q3, q0 \n"
"vshl.u32 q4, q4, q0 \n"
"vmovn.u32 d4, q3 \n"
"vmovn.u32 d5, q4 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"vst1.16 {q1}, [%1]! \n" // store 8 U pixels
"vst1.16 {q2}, [%2]! \n" // store 8 V pixels
"vld2.16 {q0, q1}, [%0]! \n" // load 8 UV
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst1.16 {q0}, [%1]! \n" // store 8 U pixels
"vst1.16 {q1}, [%2]! \n" // store 8 V pixels
"bgt 1b \n"
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
"+r"(depth), // %3
"+r"(width) // %4
:
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2", "q3", "q4");
}
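
The rewritten split row leans on VSHL taking a per-lane signed shift count, so
a negative count shifts right and the widen-to-u32 / narrow round trip is no
longer needed. A rough NEON-intrinsics equivalent (illustrative only, not the
library code; the helper name is hypothetical):

#include <arm_neon.h>
#include <stdint.h>

// De-interleave 8 UV pairs and shift right by (16 - depth) by left-shifting
// with a negative amount.
static inline void SplitUV8_16_sketch(const uint16_t* src_uv, uint16_t* dst_u,
                                      uint16_t* dst_v, int depth) {
  int16x8_t shift = vdupq_n_s16(depth - 16);  // negative => right shift
  uint16x8x2_t uv = vld2q_u16(src_uv);        // de-interleave U and V
  vst1q_u16(dst_u, vshlq_u16(uv.val[0], shift));
  vst1q_u16(dst_v, vshlq_u16(uv.val[1], shift));
}
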
@ -3306,21 +3296,20 @@ void MergeUVRow_16_NEON(const uint16_t* src_u,
int width) {
int shift = 16 - depth;
asm volatile(
"vdup.16 q2, %3 \n"
"vdup.16 q2, %4 \n"
"1: \n"
"vld1.16 {q0}, [%0]! \n" // load 8 U
"vld1.16 {q1}, [%1]! \n" // load 8 V
"vshl.u16 q0, q0, q2 \n"
"vshl.u16 q1, q1, q2 \n"
"subs %4, %4, #8 \n" // 8 src pixels per loop
"subs %3, %3, #8 \n" // 8 src pixels per loop
"vst2.16 {q0, q1}, [%2]! \n" // store 8 UV pixels
"bgt 1b \n"
: "+r"(src_u), // %0
"+r"(src_v), // %1
"+r"(dst_uv), // %2
"+r"(shift), // %3
"+r"(width) // %4
:
"+r"(width) // %3
: "r"(shift) // %4
: "cc", "memory", "q0", "q1", "q2");
}

File diff suppressed because it is too large

View File

@ -2605,6 +2605,64 @@ TEST_F(LibYUVPlanarTest, MergeUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// 16 bit channel split and merge
TEST_F(LibYUVPlanarTest, MergeUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(tmp_pixels_u_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_c, kPixels * 2);
align_buffer_page_end(tmp_pixels_u_opt, kPixels * 2);
align_buffer_page_end(tmp_pixels_v_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_opt, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_c, kPixels * 2 * 2);
MemRandomize(src_pixels, kPixels * 2 * 2);
MemRandomize(tmp_pixels_u_c, kPixels * 2);
MemRandomize(tmp_pixels_v_c, kPixels * 2);
MemRandomize(tmp_pixels_u_opt, kPixels * 2);
MemRandomize(tmp_pixels_v_opt, kPixels * 2);
MemRandomize(dst_pixels_opt, kPixels * 2 * 2);
MemRandomize(dst_pixels_c, kPixels * 2 * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_c, benchmark_width_,
(uint16_t*)tmp_pixels_v_c, benchmark_width_, benchmark_width_,
benchmark_height_, 12);
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_c, benchmark_width_,
(const uint16_t*)tmp_pixels_v_c, benchmark_width_,
(uint16_t*)dst_pixels_c, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
MaskCpuFlags(benchmark_cpu_info_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(uint16_t*)tmp_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 12);
for (int i = 0; i < benchmark_iterations_; ++i) {
MergeUVPlane_16((const uint16_t*)tmp_pixels_u_opt, benchmark_width_,
(const uint16_t*)tmp_pixels_v_opt, benchmark_width_,
(uint16_t*)dst_pixels_opt, benchmark_width_ * 2,
benchmark_width_, benchmark_height_, 12);
}
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(tmp_pixels_u_c[i], tmp_pixels_u_opt[i]);
EXPECT_EQ(tmp_pixels_v_c[i], tmp_pixels_v_opt[i]);
}
for (int i = 0; i < kPixels * 2 * 2; ++i) {
EXPECT_EQ(dst_pixels_c[i], dst_pixels_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(tmp_pixels_u_c);
free_aligned_buffer_page_end(tmp_pixels_v_c);
free_aligned_buffer_page_end(tmp_pixels_u_opt);
free_aligned_buffer_page_end(tmp_pixels_v_opt);
free_aligned_buffer_page_end(dst_pixels_opt);
free_aligned_buffer_page_end(dst_pixels_c);
}
TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
@ -2649,6 +2707,46 @@ TEST_F(LibYUVPlanarTest, SplitUVPlane_Opt) {
free_aligned_buffer_page_end(dst_pixels_c);
}
// 16 bit channel split
TEST_F(LibYUVPlanarTest, SplitUVPlane_16_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;
align_buffer_page_end(src_pixels, kPixels * 2 * 2);
align_buffer_page_end(dst_pixels_u_c, kPixels * 2);
align_buffer_page_end(dst_pixels_v_c, kPixels * 2);
align_buffer_page_end(dst_pixels_u_opt, kPixels * 2);
align_buffer_page_end(dst_pixels_v_opt, kPixels * 2);
MemRandomize(src_pixels, kPixels * 2 * 2);
MemRandomize(dst_pixels_u_c, kPixels * 2);
MemRandomize(dst_pixels_v_c, kPixels * 2);
MemRandomize(dst_pixels_u_opt, kPixels * 2);
MemRandomize(dst_pixels_v_opt, kPixels * 2);
MaskCpuFlags(disable_cpu_flags_);
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)dst_pixels_u_c, benchmark_width_,
(uint16_t*)dst_pixels_v_c, benchmark_width_, benchmark_width_,
benchmark_height_, 10);
MaskCpuFlags(benchmark_cpu_info_);
for (int i = 0; i < benchmark_iterations_; ++i) {
SplitUVPlane_16((const uint16_t*)src_pixels, benchmark_width_ * 2,
(uint16_t*)dst_pixels_u_opt, benchmark_width_,
(uint16_t*)dst_pixels_v_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 10);
}
for (int i = 0; i < kPixels * 2; ++i) {
EXPECT_EQ(dst_pixels_u_c[i], dst_pixels_u_opt[i]);
EXPECT_EQ(dst_pixels_v_c[i], dst_pixels_v_opt[i]);
}
free_aligned_buffer_page_end(src_pixels);
free_aligned_buffer_page_end(dst_pixels_u_c);
free_aligned_buffer_page_end(dst_pixels_v_c);
free_aligned_buffer_page_end(dst_pixels_u_opt);
free_aligned_buffer_page_end(dst_pixels_v_opt);
}
TEST_F(LibYUVPlanarTest, SwapUVPlane_Opt) {
// Round count up to multiple of 16
const int kPixels = (benchmark_width_ * benchmark_height_ + 15) & ~15;