mirror of
https://chromium.googlesource.com/libyuv/libyuv
synced 2025-12-07 17:26:49 +08:00
Add low level support for 12 bit 420, 422 and 444 YUV video frame conversion.
BUG=libyuv:560,chromium:445071 TEST=untested R=hubbe@chromium.org Review URL: https://codereview.chromium.org/2371293002 .
This commit is contained in:
parent
c11e9b7fb7
commit
7fc932ddd3
@ -281,6 +281,14 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
||||
const float* poly,
|
||||
int width, int height);
|
||||
|
||||
// Convert plane of 16 bit shorts to half floats.
|
||||
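// Source values are multiplied by scale before storing as half float.
// Strides are given in bytes. Returns 0 on success, or -1 if a pointer is
// NULL, width <= 0 or height == 0. A negative height inverts the image.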
|
||||
LIBYUV_API
|
||||
int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
uint16* dst_y, int dst_stride_y,
|
||||
float scale,
|
||||
int width, int height);
|
||||
|
||||
// Quantize a rectangle of ARGB. Alpha unaffected.
|
||||
// scale is a 16 bit fractional fixed point scaler between 0 and 65535.
|
||||
// interval_size should be a value between 1 and 255.
|
||||
|
||||
@ -231,6 +231,7 @@ extern "C" {
|
||||
#define HAS_YUY2TOUV422ROW_AVX2
|
||||
#define HAS_YUY2TOUVROW_AVX2
|
||||
#define HAS_YUY2TOYROW_AVX2
|
||||
#define HAS_HALFFLOATROW_AVX2
|
||||
|
||||
// Effects:
|
||||
#define HAS_ARGBADDROW_AVX2
|
||||
@ -252,7 +253,6 @@ extern "C" {
|
||||
#define HAS_ARGBTORGB565ROW_AVX2
|
||||
#define HAS_J400TOARGBROW_AVX2
|
||||
#define HAS_RGB565TOARGBROW_AVX2
|
||||
#define HAS_SHORTTOF16ROW_AVX2
|
||||
#endif
|
||||
|
||||
// The following are also available on x64 Visual C.
|
||||
@ -1934,8 +1934,10 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
int width);
|
||||
|
||||
// Scale and convert to half float.
|
||||
void ShortToF16Row_C(const uint16* src, int16* dst, float scale, int width);
|
||||
void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width);
|
||||
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width);
|
||||
void HalfFloatRow_Any_AVX2(const uint16* src, uint16* dst, float scale,
|
||||
int width);
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
const uint8* luma, uint32 lumacoeff);
|
||||
|
||||
@ -83,6 +83,7 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
|
||||
}
|
||||
|
||||
// TODO(fbarchard): Consider support for negative height.
|
||||
// TODO(fbarchard): Consider stride measured in bytes.
|
||||
LIBYUV_API
|
||||
void CopyPlane_16(const uint16* src_y, int src_stride_y,
|
||||
uint16* dst_y, int dst_stride_y,
|
||||
@ -2441,6 +2442,51 @@ int ARGBPolynomial(const uint8* src_argb, int src_stride_argb,
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Convert plane of 16 bit shorts to half floats.
|
||||
// Source values are multiplied by scale before storing as half float.
|
||||
LIBYUV_API
|
||||
int HalfFloatPlane(const uint16* src_y, int src_stride_y,
|
||||
uint16* dst_y, int dst_stride_y,
|
||||
float scale,
|
||||
int width, int height) {
|
||||
int y;
|
||||
void (*HalfFloatRow)(const uint16* src, uint16* dst, float scale, int width) =
|
||||
HalfFloatRow_C;
|
||||
if (!src_y || !dst_y || width <= 0 || height == 0) {
|
||||
return -1;
|
||||
}
|
||||
src_stride_y >>= 1;
|
||||
dst_stride_y >>= 1;
|
||||
// Negative height means invert the image.
|
||||
if (height < 0) {
|
||||
height = -height;
|
||||
src_y = src_y + (height - 1) * src_stride_y;
|
||||
src_stride_y = -src_stride_y;
|
||||
}
|
||||
// Coalesce rows.
|
||||
if (src_stride_y == width &&
|
||||
dst_stride_y == width) {
|
||||
width *= height;
|
||||
height = 1;
|
||||
src_stride_y = dst_stride_y = 0;
|
||||
}
|
||||
#if defined(HAS_HALFFLOATROW_AVX2)
|
||||
if (TestCpuFlag(kCpuHasAVX2)) {
|
||||
HalfFloatRow = HalfFloatRow_Any_AVX2;
|
||||
if (IS_ALIGNED(width, 16)) {
|
||||
HalfFloatRow = HalfFloatRow_AVX2;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
for (y = 0; y < height; ++y) {
|
||||
HalfFloatRow(src_y, dst_y, scale, width);
|
||||
src_y += src_stride_y;
|
||||
dst_y += dst_stride_y;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Apply a lumacolortable to each ARGB pixel.
|
||||
LIBYUV_API
|
||||
int ARGBLumaColorTable(const uint8* src_argb, int src_stride_argb,
|
||||
|
||||
@ -546,6 +546,28 @@ ANY11P(ARGBShuffleRow_Any_NEON, ARGBShuffleRow_NEON, const uint8*, 4, 4, 3)
|
||||
#endif
|
||||
#undef ANY11P
|
||||
|
||||
// Any 1 to 1 with parameter and shorts.  SBPP and BPP measure in shorts.
// Generates NAMEANY, a wrapper that accepts any width:
//   - the largest multiple of (MASK + 1) pixels is handed directly to ANY_SIMD;
//   - the remaining r = width & MASK pixels are copied into the first half of
//     a zeroed scratch buffer, converted as one full SIMD step into the second
//     half, and only the r valid results are copied to the destination.
// temp holds 64 shorts: elements [0, 32) are the SIMD input and [32, 64) the
// SIMD output, so (MASK + 1) * SBPP and (MASK + 1) * BPP must not exceed 32.
// memset/memcpy take byte counts, so sizes in shorts are scaled by
// sizeof(uint16).
#define ANY11P16(NAMEANY, ANY_SIMD, T, SBPP, BPP, MASK)                        \
    void NAMEANY(const uint16* src_ptr, uint16* dst_ptr,                       \
                 T shuffler, int width) {                                      \
      SIMD_ALIGNED(uint16 temp[32 * 2]);                                       \
      memset(temp, 0, 32 * sizeof(uint16)); /* for msan */                     \
      int r = width & MASK;                                                    \
      int n = width & ~MASK;                                                   \
      if (n > 0) {                                                             \
        ANY_SIMD(src_ptr, dst_ptr, shuffler, n);                               \
      }                                                                        \
      memcpy(temp, src_ptr + n * SBPP, r * SBPP * sizeof(uint16));             \
      ANY_SIMD(temp, temp + 32, shuffler, MASK + 1);                           \
      memcpy(dst_ptr + n * BPP, temp + 32, r * BPP * sizeof(uint16));          \
    }

#ifdef HAS_HALFFLOATROW_AVX2
ANY11P16(HalfFloatRow_Any_AVX2, HalfFloatRow_AVX2, float, 1, 1, 15)
#endif
#undef ANY11P16
|
||||
|
||||
|
||||
// Any 1 to 1 with yuvconstants
|
||||
#define ANY11C(NAMEANY, ANY_SIMD, UVSHIFT, SBPP, BPP, MASK) \
|
||||
void NAMEANY(const uint8* src_ptr, uint8* dst_ptr, \
|
||||
|
||||
@ -2333,6 +2333,25 @@ void ARGBPolynomialRow_C(const uint8* src_argb,
|
||||
}
|
||||
}
|
||||
|
||||
// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
|
||||
// adjust the source integer range to the half float range desired.
|
||||
|
||||
// This magic constant is 2^-112. Multiplying by this
|
||||
// is the same as subtracting 112 from the exponent, which
|
||||
// is the difference in exponent bias between 32-bit and
|
||||
// 16-bit floats. Once we've done this subtraction, we can
|
||||
// simply extract the low bits of the exponent and the high
|
||||
// bits of the mantissa from our float and we're done.
|
||||
|
||||
void HalfFloatRow_C(const uint16* src, uint16* dst, float scale, int width) {
|
||||
int i;
|
||||
float mult = 1.9259299444e-34f * scale;
|
||||
for (i = 0; i < width; ++i) {
|
||||
float value = src[i] * mult;
|
||||
dst[i] = (uint16)((*(uint32_t*)&value) >> 13);
|
||||
}
|
||||
}
|
||||
|
||||
void ARGBLumaColorTableRow_C(const uint8* src_argb, uint8* dst_argb, int width,
|
||||
const uint8* luma, uint32 lumacoeff) {
|
||||
uint32 bc = lumacoeff & 0xff;
|
||||
|
||||
@ -5366,6 +5366,39 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
||||
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
// Scale 16 bit unsigned samples by scale and store them as half floats.
// Each loop iteration consumes 16 source shorts (32 bytes) and produces 16
// half floats (32 bytes); the loop continues while the remaining width is
// greater than zero after subtracting 16, so callers are expected to pass a
// width that is a multiple of 16 (other widths go through the Any wrapper).
// vcvtps2ph is used with immediate 3 (truncate).
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    // Broadcast the scale factor to all 8 float lanes of ymm4.
    "vbroadcastss %3, %%ymm4 \n"

    // 16 pixel loop.
    LABELALIGN
    "1: \n"
    "vpmovzxwd " MEMACCESS(0) ",%%ymm0 \n"  // 8 shorts -> 8 ints
    "vpmovzxwd " MEMACCESS2(0x10,0) ",%%ymm1 \n"  // 8 more
    "lea " MEMLEA(0x20,0) ",%0 \n"
    "vcvtdq2ps %%ymm0,%%ymm0 \n"  // ints -> floats
    "vcvtdq2ps %%ymm1,%%ymm1 \n"
    "vmulps %%ymm0,%%ymm4,%%ymm0 \n"  // apply scale
    "vmulps %%ymm1,%%ymm4,%%ymm1 \n"
    "vcvtps2ph $3, %%ymm0, %%xmm0 \n"  // floats -> half floats, truncate
    "vcvtps2ph $3, %%ymm1, %%xmm1 \n"
    "vmovdqu %%xmm0," MEMACCESS(1) " \n"
    "vmovdqu %%xmm1," MEMACCESS2(0x10,1) " \n"
    "lea " MEMLEA(0x20,1) ",%1 \n"
    "sub $0x10,%2 \n"
    "jg 1b \n"
    "vzeroupper \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  // Every vector register the asm writes must be listed: xmm1 is used for
  // the second group of 8 samples alongside xmm0 and the scale in xmm4.
  : "memory", "cc",
    "xmm0", "xmm1", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBCOLORTABLEROW_X86
|
||||
// Transform ARGB pixels with color table.
|
||||
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
|
||||
|
||||
@ -6095,13 +6095,9 @@ void ARGBPolynomialRow_AVX2(const uint8* src_argb,
|
||||
}
|
||||
#endif // HAS_ARGBPOLYNOMIALROW_AVX2
|
||||
|
||||
// Samples assumed to be unsigned in low 9, 10 or 12 bits. Scale factor
|
||||
// adjust the sample range to 0 to 1 using a float multiply.
|
||||
// e.g. 9 bit scale is 1.0f / 512.0f
|
||||
// e.g. 10 bit scale is 1.0f / 1024.0f
|
||||
#ifdef HAS_SHORTTOHALFFLOAT_AVX2
|
||||
#ifdef HAS_HALFFLOATROW_AVX2
|
||||
__declspec(naked)
|
||||
void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
|
||||
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
|
||||
__asm {
|
||||
mov eax, [esp + 4] /* src */
|
||||
mov edx, [esp + 8] /* dst */
|
||||
@ -6111,19 +6107,24 @@ void ShortToF16Row_AVX2(const uint16* src, int16* dst, float scale, int width) {
|
||||
// 8 pixel loop.
|
||||
convertloop:
|
||||
vpmovzxwd ymm0, xmmword ptr [eax] // 8 shorts -> 8 ints
|
||||
lea eax, [eax + 16]
|
||||
vpmovzxwd ymm1, xmmword ptr [eax + 16] // 8 more shorts
|
||||
lea eax, [eax + 32]
|
||||
vcvtdq2ps ymm0, ymm0 // convert 8 ints to floats
|
||||
vcvtdq2ps ymm1, ymm1
|
||||
vmulps ymm0, ymm0, ymm4 // scale to normalized range 0 to 1
|
||||
vcvtps2ph xmm0, ymm0, 0 // float conver to 8 half floats round even
|
||||
vmulps ymm1, ymm1, ymm4
|
||||
vcvtps2ph xmm0, ymm0, 3 // float convert to 8 half floats truncate
|
||||
vcvtps2ph xmm1, ymm1, 3
|
||||
vmovdqu [edx], xmm0
|
||||
lea edx, [edx + 16]
|
||||
sub ecx, 8
|
||||
vmovdqu [edx + 16], xmm1
|
||||
lea edx, [edx + 32]
|
||||
sub ecx, 16
|
||||
jg convertloop
|
||||
vzeroupper
|
||||
ret
|
||||
}
|
||||
}
|
||||
#endif // HAS_SHORTTOHALFFLOAT_AVX2
|
||||
#endif // HAS_HALFFLOATROW_AVX2
|
||||
|
||||
#ifdef HAS_ARGBCOLORTABLEROW_X86
|
||||
// Transform ARGB pixels with color table.
|
||||
|
||||
@ -2081,6 +2081,46 @@ TEST_F(LibYUVPlanarTest, TestARGBPolynomial) {
|
||||
}
|
||||
}
|
||||
|
||||
// Runs HalfFloatPlane once with CPU optimizations masked off and once with
// them enabled, then checks that both outputs are byte-for-byte identical.
TEST_F(LibYUVPlanarTest, TestHalfFloatPlane) {
  // Samples are 16 bit shorts, so rows and the plane are 2 bytes per pixel.
  const int stride_bytes = benchmark_width_ * 2;
  const int y_plane_size = benchmark_width_ * benchmark_height_ * 2;
  const float scale = 1.0f / 4096.0f;

  align_buffer_page_end(orig_y, y_plane_size);
  align_buffer_page_end(dst_c, y_plane_size);
  align_buffer_page_end(dst_opt, y_plane_size);
  MemRandomize(orig_y, y_plane_size);
  // Different fill values so an untouched output byte shows up as a mismatch.
  memset(dst_c, 0, y_plane_size);
  memset(dst_opt, 1, y_plane_size);

  // Reference pass: disable all optimizations.
  MaskCpuFlags(disable_cpu_flags_);
  double c_time = get_time();
  for (int iter = 0; iter < benchmark_iterations_; iter++) {
    HalfFloatPlane((uint16*)orig_y, stride_bytes,
                   (uint16*)dst_c, stride_bytes,
                   scale, benchmark_width_, benchmark_height_);
  }
  c_time = (get_time() - c_time) / benchmark_iterations_;

  // Optimized pass: enable optimizations.
  MaskCpuFlags(benchmark_cpu_info_);
  double opt_time = get_time();
  for (int iter = 0; iter < benchmark_iterations_; iter++) {
    HalfFloatPlane((uint16*)orig_y, stride_bytes,
                   (uint16*)dst_opt, stride_bytes,
                   scale, benchmark_width_, benchmark_height_);
  }
  opt_time = (get_time() - opt_time) / benchmark_iterations_;

  for (int pos = 0; pos < y_plane_size; ++pos) {
    EXPECT_EQ(dst_c[pos], dst_opt[pos]);
  }

  free_aligned_buffer_page_end(orig_y);
  free_aligned_buffer_page_end(dst_c);
  free_aligned_buffer_page_end(dst_opt);
}
|
||||
|
||||
TEST_F(LibYUVPlanarTest, TestARGBLumaColorTable) {
|
||||
SIMD_ALIGNED(uint8 orig_pixels[1280][4]);
|
||||
SIMD_ALIGNED(uint8 dst_pixels_opt[1280][4]);
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user