BT.2020 Full Range yuvconstants

New color util to compute the constants needed based on the white point.

[ RUN      ] LibYUVColorTest.TestFullYUVV
hist	      -2	      -1	       0	       1	       2
red	       0	 1627136	13670144	 1479936	       0
green	  319285	 3456836	 9243059	 3440771	  317265
blue	       0	 1561088	14202112	 1014016	       0
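
The histogram rows count, for each channel, how often the library result differs from the floating-point reference by -2..+2 for the new BT.2020 full range path. As a sketch of the derivation the new constants follow (the WR/WB equations quoted in the BT.709 full range comments in row_common.cc), using the BT.2020 KR/KB values; this is an illustration, not the new util's actual code:

#include <stdio.h>

/* Illustrative only: derive full range YUV->RGB coefficients from the
 * luma weights KR/KB (BT.2020: KR = 0.2627, KB = 0.0593). */
static void PrintFullRangeCoefficients(double kr, double kb) {
  double kg = 1.0 - kr - kb;
  double vr = 2.0 * (1.0 - kr);           /* R = Y + VR * (V - 128) */
  double ub = 2.0 * (1.0 - kb);           /* B = Y + UB * (U - 128) */
  double vg = 2.0 * kr * (1.0 - kr) / kg; /* G = Y - UG*(U-128) - VG*(V-128) */
  double ug = 2.0 * kb * (1.0 - kb) / kg;
  printf("UB=%.6f UG=%.6f VG=%.6f VR=%.6f\n", ub, ug, vg, vr);
}

int main(void) {
  /* Should print 1.881400 0.164553 0.571353 1.474600, matching the
   * kYuvV2020Constants comments added in row_common.cc. */
  PrintFullRangeCoefficients(0.2627, 0.0593);
  return 0;
}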

Bug: libyuv:877, b/178283356
Change-Id: If432ebfab76b01302fdb416a153c4f26ca0832d6
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/2678859
Reviewed-by: Frank Barchard <fbarchard@chromium.org>
Reviewed-by: richard winterton <rrwinterton@gmail.com>
Frank Barchard 2021-02-05 16:14:25 -08:00 committed by Frank Barchard
parent 60d37a064b
commit 942c508448
15 changed files with 733 additions and 571 deletions

View File

@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
Version: 1774
Version: 1775
License: BSD
License File: LICENSE

View File

@ -213,7 +213,7 @@ int I010ToI410(const uint16_t* src_y,
// Convert I012 to I412
#define I012ToI412 I010ToI410
// Convert I212 to I412
// Convert I210 to I410
LIBYUV_API
int I210ToI410(const uint16_t* src_y,
int src_stride_y,

View File

@ -21,18 +21,20 @@ extern "C" {
#endif
// Conversion matrix for YUV to RGB
LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYuvI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYuvJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYuvF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYuvH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYuv2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYuvV2020Constants; // BT.2020 full
// Conversion matrix for YVU to BGR
LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYvuI601Constants; // BT.601
LIBYUV_API extern const struct YuvConstants kYvuJPEGConstants; // JPeg
LIBYUV_API extern const struct YuvConstants kYvuF709Constants; // BT.709 full
LIBYUV_API extern const struct YuvConstants kYvuH709Constants; // BT.709
LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
LIBYUV_API extern const struct YuvConstants kYvuV2020Constants; // BT.2020 full
// Macros for end swapped destination Matrix conversions.
// Swap UV and pass mirrored kYvuJPEGConstants matrix.
@ -42,6 +44,8 @@ LIBYUV_API extern const struct YuvConstants kYvu2020Constants; // BT.2020
#define kYuvF709ConstantsVU kYvuF709Constants
#define kYuvH709ConstantsVU kYvuH709Constants
#define kYuv2020ConstantsVU kYvu2020Constants
#define kYuvV2020ConstantsVU kYvuV2020Constants
#define NV12ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
NV21ToARGBMatrix(a, b, c, d, e, f, g##VU, h, i)
#define NV21ToABGRMatrix(a, b, c, d, e, f, g, h, i) \
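
With kYuvV2020Constants exported above, callers request the BT.2020 full range conversion through the existing *Matrix entry points. A minimal usage sketch (buffer setup elided; src_y/src_u/src_v, the strides, dst_argb, width and height are assumed to describe a valid I422 image and ARGB destination):

#include "libyuv/convert_argb.h"

// Convert I422 to ARGB with the BT.2020 full range matrix; returns 0 on success.
int ret = I422ToARGBMatrix(src_y, src_stride_y,
                           src_u, src_stride_u,
                           src_v, src_stride_v,
                           dst_argb, dst_stride_argb,
                           &kYuvV2020Constants,  // new in this change
                           width, height);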

View File

@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1774
#define LIBYUV_VERSION 1775
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@ -30,6 +30,8 @@ static __inline int Abs(int v) {
}
// I420 To any I4xx YUV format with mirroring.
// TODO(fbarchard): Consider kFilterNone for Y, or CopyPlane
static int I420ToI4xx(const uint8_t* src_y,
int src_stride_y,
const uint8_t* src_u,

View File

@ -1330,234 +1330,218 @@ void J400ToARGBRow_C(const uint8_t* src_y, uint8_t* dst_argb, int width) {
// Macros to create SIMD specific yuv to rgb conversion constants.
#if defined(__aarch64__)
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
{-UB, -VR, -UB, -VR, -UB, -VR, -UB, -VR}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, \
{BB, BG, BR, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
{-VR, -UB, -VR, -UB, -VR, -UB, -VR, -UB}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, \
{BR, BG, BB, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, VR, UB, VR, UB, VR, UB, VR}, {UB, VR, UB, VR, UB, VR, UB, VR}, \
{UG, VG, UG, VG, UG, VG, UG, VG}, {UG, VG, UG, VG, UG, VG, UG, VG}, \
{BB, BG, BR, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, UB, VR, UB, VR, UB, VR, UB}, {VR, UB, VR, UB, VR, UB, VR, UB}, \
{VG, UG, VG, UG, VG, UG, VG, UG}, {VG, UG, VG, UG, VG, UG, VG, UG}, \
{BR, BG, BB, YB, 0, 0, 0, 0}, {0x0101 * YG, YG, 0, 0}};
#elif defined(__arm__)
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, -UB, -UB, -UB, -VR, -VR, -VR, -VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BB, BG, BR, YGB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, -VR, -VR, -VR, -UB, -UB, -UB, -UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BR, BG, BB, YGB, 0, 0, 0, 0}, \
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, UB, UB, UB, VR, VR, VR, VR, 0, 0, 0, 0, 0, 0, 0, 0}, \
{UG, UG, UG, UG, VG, VG, VG, VG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BB, BG, BR, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, VR, VR, VR, UB, UB, UB, UB, 0, 0, 0, 0, 0, 0, 0, 0}, \
{VG, VG, VG, VG, UG, UG, UG, UG, 0, 0, 0, 0, 0, 0, 0, 0}, \
{BR, BG, BB, YB, 0, 0, 0, 0}, \
{0x0101 * YG, YG, 0, 0}};
#else
#define MAKEYUVCONSTANTS(name, YG, YGB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, \
UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, \
0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
YGB, YGB}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, \
VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0, VR, 0}, \
{VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
{0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, \
0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB, 0, UB}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, YGB, \
YGB, YGB}};
#define MAKEYUVCONSTANTS(name, YG, YB, UB, UG, VG, VR, BB, BG, BR) \
const struct YuvConstants SIMD_ALIGNED(kYuv##name##Constants) = { \
{-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, \
-UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0}, \
{UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, \
UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG}, \
{0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, \
0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}}; \
const struct YuvConstants SIMD_ALIGNED(kYvu##name##Constants) = { \
{-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, \
-VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0, -VR, 0}, \
{VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, \
VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG}, \
{0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, \
0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB, 0, -UB}, \
{BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR, BR}, \
{BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG, BG}, \
{BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB, BB}, \
{YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG, YG}, \
{YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB, YB}};
#endif
// TODO(fbarchard): Generate SIMD structures from float matrix.
// BT.601 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 - U * -2.018
// Bias values to round, and subtract 128 from U and V.
#define BB (-UB * 128 + YB)
#define BG (UG * 128 + VG * 128 + YB)
#define BR (-VR * 128 + YB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// BT.601 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.596
// G = (Y - 16) * 1.164 - U * 0.391 - V * 0.813
// B = (Y - 16) * 1.164 + U * 2.018
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR -102 /* round(-1.596 * 64) */
#define UB 128 /* max(128, round(2.018 * 64)) */
#define UG 25 /* round(0.391 * 64) */
#define VG 52 /* round(0.813 * 64) */
#define VR 102 /* round(1.596 * 64) */
// Bias values to subtract 16 from Y and 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(I601, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(I601, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
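
A cross-check on the limited range coefficients above (and in the BT.709 block below): they are the corresponding full range coefficients scaled by 255/224 for chroma, and the 1.164 Y gain is 255/219. A standalone sketch of that arithmetic, not library code:

#include <stdio.h>

int main(void) {
  printf("%.3f %.3f %.3f %.3f\n",
         255.0 / 219.0,            /* ~1.164  Y gain for limited range */
         1.402 * 255.0 / 224.0,    /* ~1.596  BT.601 VR */
         1.5748 * 255.0 / 224.0,   /* ~1.793  BT.709 VR */
         1.8556 * 255.0 / 224.0);  /* ~2.112  BT.709 UB */
  return 0;
}
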
// JPEG YUV to RGB reference
// * R = Y - V * -1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y - U * -1.77200
// BT.601 full range YUV to RGB reference (aka JPEG)
// * R = Y + V * 1.40200
// * G = Y - U * 0.34414 - V * 0.71414
// * B = Y + U * 1.77200
// KR = 0.299; KB = 0.114
// U and V contributions to R,G,B.
#define UB 113 /* round(1.77200 * 64) */
#define UG 22 /* round(0.34414 * 64) */
#define VG 46 /* round(0.71414 * 64) */
#define VR 90 /* round(1.40200 * 64) */
// Y contribution to R,G,B. Scale and bias.
#define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
#define YGB 32 /* 64 / 2 */
#define YB 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UB -113 /* round(-1.77200 * 64) */
#define UG 22 /* round(0.34414 * 64) */
#define VG 46 /* round(0.71414 * 64) */
#define VR -90 /* round(-1.40200 * 64) */
MAKEYUVCONSTANTS(JPEG, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
MAKEYUVCONSTANTS(JPEG, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.709 YUV to RGB reference
// R = (Y - 16) * 1.164 - V * -1.793
// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
// B = (Y - 16) * 1.164 - U * -2.112
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
// BT.709 limited range YUV to RGB reference
// R = (Y - 16) * 1.164 + V * 1.793
// G = (Y - 16) * 1.164 - U * 0.213 - V * 0.533
// B = (Y - 16) * 1.164 + U * 2.112
// KR = 0.2126, KB = 0.0722
// TODO(fbarchard): Find way to express 2.112 instead of 2.0.
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.112 * 64)) */
#define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */
#define VR -115 /* round(-1.793 * 64) */
#define UB 128 /* max(128, round(2.112 * 64)) */
#define UG 14 /* round(0.213 * 64) */
#define VG 34 /* round(0.533 * 64) */
#define VR 115 /* round(1.793 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(H709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(H709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.709 full range YUV to RGB reference
// R = Y - V * -1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y - U * -1.8556
// WR = 0.2126
// WB = 0.0722
// WR and WB given, the equations are:
// R = Y + (2 * (1 - WR)) * V;
// G = Y - ((2 * ((WR * (1 - WR) * V) + (WB * (1 - WB) * U))) / (1 - WB - WR));
// B = Y + (2 * (1 - WB)) * U;
// R = Y + V * 1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y + U * 1.8556
// KR = 0.2126, KB = 0.0722
// U and V contributions to R,G,B.
#define UB 119 /* round(1.8556 * 64) */
#define UG 12 /* round(0.18732 * 64) */
#define VG 30 /* round(0.46812 * 64) */
#define VR 101 /* round(1.5748 * 64) */
// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YGB 32 /* 64 / 2 */
#define YB 32 /* 64 / 2 */
// U and V contributions to R,G,B.
#define UB -119 /* round(-1.8556 * 64) */
#define UG 12 /* round(0.18732 * 64) */
#define VG 30 /* round(0.46812 * 64) */
#define VR -101 /* round(-1.5748 * 64) */
MAKEYUVCONSTANTS(F709, YG, YB, UB, UG, VG, VR, BB, BG, BR)
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
MAKEYUVCONSTANTS(F709, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef YG
// BT.2020 YUV to RGB reference
// R = (Y - 16) * 1.164384 - V * -1.67867
// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
// B = (Y - 16) * 1.164384 - U * -2.14177
// Y contribution to R,G,B. Scale and bias.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YGB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
// BT.2020 limited range YUV to RGB reference
// R = (Y - 16) * 1.164384 + V * 1.67867
// G = (Y - 16) * 1.164384 - U * 0.187326 - V * 0.65042
// B = (Y - 16) * 1.164384 + U * 2.14177
// KR = 0.2627; KB = 0.0593
// TODO(fbarchard): Improve accuracy; the B channel is off by 7%.
// U and V contributions to R,G,B.
#define UB -128 /* max(-128, round(-2.142 * 64)) */
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
#define VR -107 /* round(-1.67867 * 64) */
#define UB 128 /* max(128, round(2.142 * 64)) */
#define UG 12 /* round(0.187326 * 64) */
#define VG 42 /* round(0.65042 * 64) */
#define VR 107 /* round(1.67867 * 64) */
// Bias values to round, and subtract 128 from U and V.
#define BB (UB * 128 + YGB)
#define BG (UG * 128 + VG * 128 + YGB)
#define BR (VR * 128 + YGB)
// Y contribution to R,G,B. Scale and bias.
#define YG 19003 /* round(1.164384 * 64 * 256 * 256 / 257) */
#define YB -1160 /* 1.164384 * 64 * -16 + 64 / 2 */
MAKEYUVCONSTANTS(2020, YG, YGB, UB, UG, VG, VR, BB, BG, BR)
MAKEYUVCONSTANTS(2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef BB
#undef BG
#undef BR
#undef YGB
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
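
The ~7% B channel error called out in the TODO above follows from the coefficient clamp: round(2.14177 * 64) = 137, but the coefficients are stored as 8-bit values for the SIMD multiplies, so UB is pinned at 128. A quick standalone check (not library code):

#include <stdio.h>

int main(void) {
  double ideal = 2.14177 * 64.0; /* 137.07 */
  double used = 128.0;           /* clamped to fit an 8-bit coefficient */
  printf("shortfall = %.1f%%\n", 100.0 * (ideal - used) / ideal); /* ~6.6 */
  return 0;
}
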
// BT.2020 full range YUV to RGB reference
// R = Y + V * 1.474600
// G = Y - U * 0.164553 - V * 0.571353
// B = Y + U * 1.881400
// KR = 0.2627; KB = 0.0593
#define UB 120 /* round(1.881400 * 64) */
#define UG 11 /* round(0.164553 * 64) */
#define VG 37 /* round(0.571353 * 64) */
#define VR 94 /* round(1.474600 * 64) */
// Y contribution to R,G,B. Scale and bias. (same as jpeg)
#define YG 16320 /* round(1 * 64 * 256 * 256 / 257) */
#define YB 32 /* 64 / 2 */
MAKEYUVCONSTANTS(V2020, YG, YB, UB, UG, VG, VR, BB, BG, BR)
#undef YG
#undef YB
#undef UB
#undef UG
#undef VG
#undef VR
#undef BB
#undef BG
#undef BR
#undef MAKEYUVCONSTANTS
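
A note on the Y terms shared by all the matrices above: the last initializer in each structure stores 0x0101 * YG, and every YG is defined with a divide by 257, so the two factors cancel and (y * 0x0101 * YG) >> 16 approximates y * gain * 64. A small standalone check of that arithmetic (illustrative; the per-architecture kernels differ in detail):

#include <stdio.h>

int main(void) {
  unsigned y = 200;
  unsigned full = (y * 0x0101u * 16320u) >> 16;  /* full range YG */
  unsigned lim = (y * 0x0101u * 18997u) >> 16;   /* limited range YG */
  printf("%u ~= %u\n", full, y * 64);            /* 12799 ~= 12800 */
  printf("%u ~= %.0f\n", lim, y * 1.164 * 64);   /* 14899 ~= 14899 */
  return 0;
}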

View File

@ -1336,7 +1336,7 @@ void ScalePlaneBilinearUp(int src_width,
}
}
// Scale plane, horizontally 2 times, vertically any time.
// Scale plane, horizontally up by 2 times.
// Uses linear filter horizontally, nearest vertically.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
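
An illustrative scalar form of the horizontal kernel these optimized rows implement, based on the "3*near+far" comments in the SIMD sources (exact phase and right-edge handling follow the library's sample-position diagrams and the _Any wrappers):

#include <stdint.h>

// Each pair of output pixels mixes two adjacent source samples with weights
// 3/4 and 1/4: (3*near + far + 2) >> 2, rounded. Assumes src has
// dst_width / 2 + 1 readable samples; the _Any wrappers handle the true edge.
static void Up2LinearRowSketch(const uint8_t* src, uint8_t* dst, int dst_width) {
  int x;
  for (x = 0; x < dst_width / 2; ++x) {
    dst[2 * x + 0] = (uint8_t)((3 * src[x] + src[x + 1] + 2) >> 2);
    dst[2 * x + 1] = (uint8_t)((src[x] + 3 * src[x + 1] + 2) >> 2);
  }
}
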
@ -1356,7 +1356,7 @@ void ScalePlaneUp2_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@ -1396,7 +1396,7 @@ void ScalePlaneUp2_Linear(int src_width,
}
}
// Scale plane, 2 times.
// Scale plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// This is used to scale U and V planes of I420 to I444.
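
The "9 3 3 1" comments in the SIMD rows describe the 2D weights this bilinear path computes. A scalar sketch of one output sample (illustrative; the real code produces two destination rows per pass from two source rows):

#include <stdint.h>

// s00/s01 are adjacent samples on one source row, s10/s11 on the next row.
// Output = (9*nearest + 3*row neighbor + 3*column neighbor + 1*diagonal + 8) >> 4.
static uint8_t Bilinear2xSketch(uint8_t s00, uint8_t s01,
                                uint8_t s10, uint8_t s11) {
  return (uint8_t)((9 * s00 + 3 * s01 + 3 * s10 + s11 + 8) >> 4);
}
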
@ -1414,7 +1414,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
@ -1449,7 +1449,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
for (x = 0; x < src_height - 1; ++x) {
Scale2RowUp(src_ptr, src_stride, dst_ptr, dst_stride, dst_width);
src_ptr += src_stride;
// TODO test performance of writing one row of destination at a time
// TODO: Test performance of writing one row of destination at a time.
dst_ptr += 2 * dst_stride;
}
if (!(dst_height & 1)) {
@ -1458,7 +1458,7 @@ void ScalePlaneUp2_Bilinear(int src_width,
}
}
// Scale at most 14bit plane, horizontally 2 times.
// Scale at most 14 bit plane, horizontally up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original width, using linear interpolation.
// stride is in count of uint16_t.
@ -1478,7 +1478,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
int dy;
// This function can only scale up by 2 times horizontally.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
#ifdef HAS_SCALEROWUP2LINEAR_SSE2
if (TestCpuFlag(kCpuHasSSE2)) {
@ -1512,7 +1512,7 @@ void ScalePlaneUp2_16_Linear(int src_width,
}
}
// Scale at most 12bit plane, up 2 times.
// Scale at most 12 bit plane, up by 2 times.
// This is an optimized version for scaling up a plane to 2 times of
// its original size, using bilinear interpolation.
// stride is in count of uint16_t.
@ -1531,7 +1531,7 @@ void ScalePlaneUp2_16_Bilinear(int src_width,
int x;
// This function can only scale up by 2 times.
assert(src_width * 2 == dst_width || src_width * 2 == dst_width + 1);
assert(src_width == ((dst_width + 1) / 2));
assert(src_height * 2 == dst_height || src_height * 2 == dst_height + 1);
#ifdef HAS_SCALEROWUP2LINEAR_SSE2

View File

@ -625,7 +625,7 @@ CANY(ScaleARGBFilterCols_Any_MSA,
dst_ptr[dst_width - 1] = src_ptr[(dst_width / 2) - 1]; \
}
// Even the C version need to be wrapped, because boundary pixels have to
// Even the C versions need to be wrapped, because boundary pixels have to
// be handled differently
SUH2LANY(ScaleRowUp2_Linear_Any_C,

View File

@ -400,7 +400,7 @@ void ScaleRowDown34_1_Box_16_C(const uint16_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
// Sample position: (O is src sample position, X is dst sample position)
//
// v dst_ptr at here v stop at here
// X O X X O X X O X X O X X O X
@ -417,7 +417,7 @@ void ScaleRowUp2_Linear_C(const uint8_t* src_ptr,
}
}
// sample position: (O is src sample position, X is dst sample position)
// Sample position: (O is src sample position, X is dst sample position)
//
// src_ptr at here
// X v X X X X X X X X X
@ -451,7 +451,7 @@ void ScaleRowUp2_Bilinear_C(const uint8_t* src_ptr,
}
}
// only suitable for at most 14bit range.
// Only suitable for at most 14 bit range.
void ScaleRowUp2_Linear_16_C(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {

View File

@ -197,7 +197,6 @@ void ScaleRowDown2_AVX2(const uint8_t* src_ptr,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"vmovdqu (%0),%%ymm0 \n"
@ -485,7 +484,6 @@ void ScaleRowDown34_SSSE3(const uint8_t* src_ptr,
"m"(kShuf2) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -532,7 +530,6 @@ void ScaleRowDown34_1_Box_SSSE3(const uint8_t* src_ptr,
"m"(kRound34) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@ -599,7 +596,6 @@ void ScaleRowDown34_0_Box_SSSE3(const uint8_t* src_ptr,
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm6 \n"
@ -692,7 +688,6 @@ void ScaleRowDown38_2_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAb2) // %3
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -736,7 +731,6 @@ void ScaleRowDown38_3_Box_SSSE3(const uint8_t* src_ptr,
"m"(kScaleAc33) // %2
);
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -790,7 +784,6 @@ void ScaleRowUp2_Linear_SSE2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@ -847,7 +840,6 @@ void ScaleRowUp2_Bilinear_SSE2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"pxor %%xmm0,%%xmm0 \n" // 0
@ -962,7 +954,6 @@ void ScaleRowUp2_Linear_16_SSE2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
@ -1015,7 +1006,6 @@ void ScaleRowUp2_Bilinear_16_SSE2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pxor %%xmm0,%%xmm0 \n" // 0
"pcmpeqw %%xmm7,%%xmm7 \n"
"psrlw $15,%%xmm7 \n"
@ -1124,29 +1114,28 @@ void ScaleRowUp2_Linear_SSSE3(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
"pcmpeqw %%xmm4,%%xmm4 \n"
"psrlw $15,%%xmm4 \n"
"psllw $1,%%xmm4 \n" // all 2
"movdqu %3,%%xmm3 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm3,%%xmm2 \n" // 3*near+far (hi)
"pmaddubsw %%xmm3,%%xmm0 \n" // 3*near+far (lo)
"paddw %%xmm4,%%xmm0 \n" // 3*near+far+2 (lo)
"paddw %%xmm4,%%xmm2 \n" // 3*near+far+2 (hi)
"psrlw $2,%%xmm0 \n" // 3/4*near+1/4*far (lo)
"psrlw $2,%%xmm2 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%xmm2,%%xmm0,%%xmm0 \n"
"vmovdqu %%xmm0,(%1) \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
@ -1167,76 +1156,75 @@ void ScaleRowUp2_Bilinear_SSSE3(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
"pcmpeqw %%xmm6,%%xmm6 \n"
"psrlw $15,%%xmm6 \n"
"psllw $3,%%xmm6 \n" // all 8
"movdqu %5,%%xmm7 \n"
LABELALIGN
"1: \n"
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0),%%xmm0 \n" // 01234567
"movq 1(%0),%%xmm1 \n" // 12345678
"punpcklwd %%xmm0,%%xmm0 \n" // 0101232345456767
"punpcklwd %%xmm1,%%xmm1 \n" // 1212343456567878
"movdqa %%xmm0,%%xmm2 \n"
"punpckhdq %%xmm1,%%xmm2 \n" // 4545565667677878
"punpckldq %%xmm1,%%xmm0 \n" // 0101121223233434
"pmaddubsw %%xmm7,%%xmm2 \n" // 3*near+far (1, hi)
"pmaddubsw %%xmm7,%%xmm0 \n" // 3*near+far (1, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
"movq (%0,%3),%%xmm1 \n"
"movq 1(%0,%3),%%xmm4 \n"
"punpcklwd %%xmm1,%%xmm1 \n"
"punpcklwd %%xmm4,%%xmm4 \n"
"movdqa %%xmm1,%%xmm3 \n"
"punpckhdq %%xmm4,%%xmm3 \n"
"punpckldq %%xmm4,%%xmm1 \n"
"pmaddubsw %%xmm7,%%xmm3 \n" // 3*near+far (2, hi)
"pmaddubsw %%xmm7,%%xmm1 \n" // 3*near+far (2, lo)
// xmm0 xmm2
// xmm1 xmm3
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm0,%%xmm4 \n"
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm0,%%xmm4 \n" // 6*near+2*far (1, lo)
"paddw %%xmm6,%%xmm5 \n" // 3*near+far+8 (2, lo)
"paddw %%xmm0,%%xmm4 \n" // 9*near+3*far (1, lo)
"paddw %%xmm5,%%xmm4 \n" // 9 3 3 1 + 8 (1, lo)
"psrlw $4,%%xmm4 \n" // ^ div by 16 (1, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm1,%%xmm5 \n"
"paddw %%xmm1,%%xmm5 \n" // 6*near+2*far (2, lo)
"paddw %%xmm6,%%xmm0 \n" // 3*near+far+8 (1, lo)
"paddw %%xmm1,%%xmm5 \n" // 9*near+3*far (2, lo)
"paddw %%xmm0,%%xmm5 \n" // 9 3 3 1 + 8 (2, lo)
"psrlw $4,%%xmm5 \n" // ^ div by 16 (2, lo)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm2,%%xmm0 \n"
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm2,%%xmm0 \n" // 6*near+2*far (1, hi)
"paddw %%xmm6,%%xmm1 \n" // 3*near+far+8 (2, hi)
"paddw %%xmm2,%%xmm0 \n" // 9*near+3*far (1, hi)
"paddw %%xmm1,%%xmm0 \n" // 9 3 3 1 + 8 (1, hi)
"psrlw $4,%%xmm0 \n" // ^ div by 16 (1, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"movdqa %%xmm3,%%xmm1 \n"
"paddw %%xmm3,%%xmm1 \n" // 6*near+2*far (2, hi)
"paddw %%xmm6,%%xmm2 \n" // 3*near+far+8 (1, hi)
"paddw %%xmm3,%%xmm1 \n" // 9*near+3*far (2, hi)
"paddw %%xmm2,%%xmm1 \n" // 9 3 3 1 + 8 (2, hi)
"psrlw $4,%%xmm1 \n" // ^ div by 16 (2, hi)
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"packuswb %%xmm0,%%xmm4 \n"
"movdqu %%xmm4,(%1) \n" // store above
"packuswb %%xmm1,%%xmm5 \n"
"movdqu %%xmm5,(%1,%4) \n" // store below
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x8(%0),%0 \n"
"lea 0x10(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1257,30 +1245,29 @@ void ScaleRowUp2_Linear_AVX2(const uint8_t* src_ptr,
uint8_t* dst_ptr,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpmaddubsw %%ymm3,%%ymm0,%%ymm0 \n" // 3*near+far (lo)
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddw %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrlw $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackuswb %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
@ -1301,72 +1288,71 @@ void ScaleRowUp2_Bilinear_AVX2(const uint8_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqw %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrlw $15,%%ymm6,%%ymm6 \n"
"vpsllw $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0),%%xmm0 \n" // 0123456789ABCDEF
"vmovdqu 1(%0),%%xmm1 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm0,%%ymm0 \n"
"vpermq $0b11011000,%%ymm1,%%ymm1 \n"
"vpunpcklwd %%ymm0,%%ymm0,%%ymm0 \n"
"vpunpcklwd %%ymm1,%%ymm1,%%ymm1 \n"
"vpunpckhdq %%ymm1,%%ymm0,%%ymm2 \n"
"vpunpckldq %%ymm1,%%ymm0,%%ymm0 \n"
"vpmaddubsw %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpmaddubsw %%ymm7,%%ymm0,%%ymm0 \n" // 3*near+far (1, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
"vmovdqu (%0,%3),%%xmm2 \n" // 0123456789ABCDEF
"vmovdqu 1(%0,%3),%%xmm3 \n" // 123456789ABCDEF0
"vpermq $0b11011000,%%ymm2,%%ymm2 \n"
"vpermq $0b11011000,%%ymm3,%%ymm3 \n"
"vpunpcklwd %%ymm2,%%ymm2,%%ymm2 \n"
"vpunpcklwd %%ymm3,%%ymm3,%%ymm3 \n"
"vpunpckhdq %%ymm3,%%ymm2,%%ymm4 \n"
"vpunpckldq %%ymm3,%%ymm2,%%ymm2 \n"
"vpmaddubsw %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpmaddubsw %%ymm7,%%ymm2,%%ymm2 \n" // 3*near+far (2, lo)
// ymm0 ymm1
// ymm2 ymm3
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddw %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddw %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddw %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrlw $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddw %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddw %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddw %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrlw $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddw %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpaddw %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddw %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddw %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddw %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrlw $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"vpackuswb %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackuswb %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 16 sample to 32 sample
"sub $0x20,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1386,35 +1372,34 @@ void ScaleRowUp2_Linear_16_AVX2(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n" // 3*near+far
"vpaddw %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2
"vpsrlw $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1427,37 +1412,36 @@ void ScaleRowUp2_Linear_16_AVX2_Full(const uint16_t* src_ptr,
uint16_t* dst_ptr,
int dst_width) {
asm volatile(
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
"vmovdqu %3,%%ymm3 \n"
"vpcmpeqd %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrld $31,%%ymm4,%%ymm4 \n"
"vpslld $1,%%ymm4,%%ymm4 \n" // all 2
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm3,%%ymm1,%%ymm0 \n" // 3*near+far (lo)
"vpmaddwd %%ymm3,%%ymm2,%%ymm1 \n" // 3*near+far (hi)
"vpaddd %%ymm4,%%ymm0,%%ymm0 \n" // 3*near+far+2 (lo)
"vpaddd %%ymm4,%%ymm1,%%ymm1 \n" // 3*near+far+2 (hi)
"vpsrad $2,%%ymm0,%%ymm0 \n" // 3/4*near+1/4*far (lo)
"vpsrad $2,%%ymm1,%%ymm1 \n" // 3/4*near+1/4*far (hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm0 \n"
"vmovdqu %%ymm0,(%1) \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1473,57 +1457,56 @@ void ScaleRowUp2_Bilinear_16_AVX2(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
"vmovdqu %5,%%ymm5 \n"
"vpcmpeqw %%ymm4,%%ymm4,%%ymm4 \n"
"vpsrlw $15,%%ymm4,%%ymm4 \n"
"vpsllw $3,%%ymm4,%%ymm4 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm5,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm2 \n" // 3*near+far (1)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vmovdqu (%0,%3,2),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm3 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm5,%%ymm1,%%ymm0 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm5,%%ymm3,%%ymm1 \n" // 3*near+far (2, hi)
"vpackssdw %%ymm1,%%ymm0,%%ymm3 \n" // 3*near+far (2)
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm2,%%ymm2,%%ymm0 \n" // 6*near+2*far (1)
"vpaddw %%ymm4,%%ymm3,%%ymm1 \n" // 3*near+far+8 (2)
"vpaddw %%ymm0,%%ymm2,%%ymm0 \n" // 9*near+3*far (1)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (1)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1) \n" // store above
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"vpaddw %%ymm3,%%ymm3,%%ymm0 \n" // 6*near+2*far (2)
"vpaddw %%ymm4,%%ymm2,%%ymm1 \n" // 3*near+far+8 (1)
"vpaddw %%ymm0,%%ymm3,%%ymm0 \n" // 9*near+3*far (2)
"vpaddw %%ymm0,%%ymm1,%%ymm0 \n" // 9 3 3 1 + 8 (2)
"vpsrlw $4,%%ymm0,%%ymm0 \n" // ^ div by 16
"vmovdqu %%ymm0,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1540,70 +1523,69 @@ void ScaleRowUp2_Bilinear_16_AVX2_Full(const uint16_t* src_ptr,
ptrdiff_t dst_stride,
int dst_width) {
asm volatile(
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
"vmovdqu %5,%%ymm7 \n"
"vpcmpeqd %%ymm6,%%ymm6,%%ymm6 \n"
"vpsrld $31,%%ymm6,%%ymm6 \n"
"vpslld $3,%%ymm6,%%ymm6 \n" // all 8
LABELALIGN
"1: \n"
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0),%%xmm0 \n" // 01234567 (16b)
"vmovdqu 2(%0),%%xmm1 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm0,%%ymm0 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm1,%%ymm1 \n" // 1234000056780000
"vpunpckldq %%ymm0,%%ymm0,%%ymm0 \n" // 0101232345456767
"vpunpckldq %%ymm1,%%ymm1,%%ymm1 \n" // 1212343456567878
"vpunpckhqdq %%ymm1,%%ymm0,%%ymm2 \n" // 2323343467677878
"vpunpcklqdq %%ymm1,%%ymm0,%%ymm1 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm1,%%ymm0 \n" // 3*near+far (1, lo)
"vpmaddwd %%ymm7,%%ymm2,%%ymm1 \n" // 3*near+far (1, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vmovdqu (%0,%3,2),%%xmm2 \n" // 01234567 (16b)
"vmovdqu 2(%0,%3,2),%%xmm3 \n" // 12345678 (16b)
"vpermq $0b11011000,%%ymm2,%%ymm2 \n" // 0123000045670000
"vpermq $0b11011000,%%ymm3,%%ymm3 \n" // 1234000056780000
"vpunpckldq %%ymm2,%%ymm2,%%ymm2 \n" // 0101232345456767
"vpunpckldq %%ymm3,%%ymm3,%%ymm3 \n" // 1212343456567878
"vpunpckhqdq %%ymm3,%%ymm2,%%ymm4 \n" // 2323343467677878
"vpunpcklqdq %%ymm3,%%ymm2,%%ymm3 \n" // 0101121245455656
"vpmaddwd %%ymm7,%%ymm3,%%ymm2 \n" // 3*near+far (2, lo)
"vpmaddwd %%ymm7,%%ymm4,%%ymm3 \n" // 3*near+far (2, hi)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm0,%%ymm0,%%ymm4 \n" // 6*near+2*far (1, lo)
"vpaddd %%ymm6,%%ymm2,%%ymm5 \n" // 3*near+far+8 (2, lo)
"vpaddd %%ymm4,%%ymm0,%%ymm4 \n" // 9*near+3*far (1, lo)
"vpaddd %%ymm4,%%ymm5,%%ymm4 \n" // 9 3 3 1 + 8 (1, lo)
"vpsrad $4,%%ymm4,%%ymm4 \n" // ^ div by 16 (1, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm2,%%ymm2,%%ymm5 \n" // 6*near+2*far (2, lo)
"vpaddd %%ymm6,%%ymm0,%%ymm0 \n" // 3*near+far+8 (1, lo)
"vpaddd %%ymm5,%%ymm2,%%ymm5 \n" // 9*near+3*far (2, lo)
"vpaddd %%ymm5,%%ymm0,%%ymm5 \n" // 9 3 3 1 + 8 (2, lo)
"vpsrad $4,%%ymm5,%%ymm5 \n" // ^ div by 16 (2, lo)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm1,%%ymm1,%%ymm0 \n" // 6*near+2*far (1, hi)
"vpaddd %%ymm6,%%ymm3,%%ymm2 \n" // 3*near+far+8 (2, hi)
"vpaddd %%ymm0,%%ymm1,%%ymm0 \n" // 9*near+3*far (1, hi)
"vpaddd %%ymm0,%%ymm2,%%ymm0 \n" // 9 3 3 1 + 8 (1, hi)
"vpsrad $4,%%ymm0,%%ymm0 \n" // ^ div by 16 (1, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpaddd %%ymm3,%%ymm3,%%ymm2 \n" // 6*near+2*far (2, hi)
"vpaddd %%ymm6,%%ymm1,%%ymm1 \n" // 3*near+far+8 (1, hi)
"vpaddd %%ymm2,%%ymm3,%%ymm2 \n" // 9*near+3*far (2, hi)
"vpaddd %%ymm2,%%ymm1,%%ymm2 \n" // 9 3 3 1 + 8 (2, hi)
"vpsrad $4,%%ymm2,%%ymm2 \n" // ^ div by 16 (2, hi)
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"vpackssdw %%ymm0,%%ymm4,%%ymm4 \n"
"vmovdqu %%ymm4,(%1) \n" // store above
"vpackssdw %%ymm2,%%ymm5,%%ymm5 \n"
"vmovdqu %%ymm5,(%1,%4,2) \n" // store below
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
"lea 0x10(%0),%0 \n"
"lea 0x20(%1),%1 \n" // 8 sample to 16 sample
"sub $0x10,%2 \n"
"jg 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width) // %2
@ -1620,7 +1602,6 @@ void ScaleAddRow_SSE2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"pxor %%xmm5,%%xmm5 \n"
// 16 pixel loop.
@ -1653,7 +1634,6 @@ void ScaleAddRow_AVX2(const uint8_t* src_ptr,
uint16_t* dst_ptr,
int src_width) {
asm volatile(
"vpxor %%ymm5,%%ymm5,%%ymm5 \n"
LABELALIGN
@ -1776,8 +1756,8 @@ void ScaleFilterCols_SSSE3(uint8_t* dst_ptr,
"x"(kFsub80), // %8
"x"(kFadd40) // %9
#else
"m"(kFsub80), // %8
"m"(kFadd40) // %9
"m"(kFsub80), // %8
"m"(kFadd40) // %9
#endif
: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
"xmm7");
@ -1793,7 +1773,6 @@ void ScaleColsUp2_SSE2(uint8_t* dst_ptr,
(void)x;
(void)dx;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"
@ -1820,7 +1799,6 @@ void ScaleARGBRowDown2_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -1844,7 +1822,6 @@ void ScaleARGBRowDown2Linear_SSE2(const uint8_t* src_argb,
int dst_width) {
(void)src_stride;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -1870,7 +1847,6 @@ void ScaleARGBRowDown2Box_SSE2(const uint8_t* src_argb,
uint8_t* dst_argb,
int dst_width) {
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%0),%%xmm0 \n"
@ -2057,7 +2033,6 @@ void ScaleARGBColsUp2_SSE2(uint8_t* dst_argb,
(void)x;
(void)dx;
asm volatile(
LABELALIGN
"1: \n"
"movdqu (%1),%%xmm0 \n"

View File

@ -509,7 +509,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -527,7 +526,7 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
"vst2.8 {d0, d1}, [%1]! \n" // store
"subs %2, %2, #16 \n" // 8 sample -> 16 sample
"bgt 1b \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
@ -548,7 +547,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -612,7 +610,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"
@ -649,7 +646,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"vmov.u16 q15, #3 \n"
"1: \n"

View File

@ -540,7 +540,6 @@ void ScaleRowUp2_Linear_NEON(const uint8_t* src_ptr,
int dst_width) {
const uint8_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8b, #3 \n"
"1: \n"
@ -580,7 +579,6 @@ void ScaleRowUp2_Bilinear_NEON(const uint8_t* src_ptr,
const uint8_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8b, #3 \n"
"movi v30.8h, #3 \n"
@ -637,7 +635,6 @@ void ScaleRowUp2_Linear_16_NEON(const uint16_t* src_ptr,
int dst_width) {
const uint16_t* src_temp = src_ptr + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -675,7 +672,6 @@ void ScaleRowUp2_Bilinear_16_NEON(const uint16_t* src_ptr,
const uint16_t* src_temp1 = src_ptr1 + 1;
asm volatile(
"movi v31.8h, #3 \n"
"1: \n"
@ -1317,13 +1313,13 @@ void ScaleUVRowDownEven_NEON(const uint8_t* src_ptr,
(void)src_stride;
asm volatile(
"1: \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"
"ld1 {v2.h}[0], [%2], %6 \n"
"ld1 {v3.h}[0], [%3], %6 \n"
"subs %w5, %w5, #4 \n" // 4 pixels per loop.
"st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
"b.gt 1b \n"
"ld1 {v0.h}[0], [%0], %6 \n"
"ld1 {v1.h}[0], [%1], %6 \n"
"ld1 {v2.h}[0], [%2], %6 \n"
"ld1 {v3.h}[0], [%3], %6 \n"
"subs %w5, %w5, #4 \n" // 4 pixels per loop.
"st4 {v0.h, v1.h, v2.h, v3.h}[0], [%4], #8 \n"
"b.gt 1b \n"
: "+r"(src_ptr), // %0
"+r"(src1_ptr), // %1
"+r"(src2_ptr), // %2

View File

@ -257,6 +257,32 @@ static void YUVUToRGB(int y, int u, int v, int* r, int* g, int* b) {
*r = orig_pixels[2];
}
#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
static void YUVVToRGB(int y, int u, int v, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
const int kPixels = kWidth * kHeight;
const int kHalfPixels = ((kWidth + 1) / 2) * ((kHeight + 1) / 2);
SIMD_ALIGNED(uint8_t orig_y[16]);
SIMD_ALIGNED(uint8_t orig_u[8]);
SIMD_ALIGNED(uint8_t orig_v[8]);
SIMD_ALIGNED(uint8_t orig_pixels[16 * 4]);
memset(orig_y, y, kPixels);
memset(orig_u, u, kHalfPixels);
memset(orig_v, v, kHalfPixels);
/* YUV converted to ARGB. */
V422ToARGB(orig_y, kWidth, orig_u, (kWidth + 1) / 2, orig_v, (kWidth + 1) / 2,
orig_pixels, kWidth * 4, kWidth, kHeight);
*b = orig_pixels[0];
*g = orig_pixels[1];
*r = orig_pixels[2];
}
static void YToRGB(int y, int* r, int* g, int* b) {
const int kWidth = 16;
const int kHeight = 1;
@ -405,21 +431,21 @@ TEST_F(LibYUVColorTest, TestRoundToByte) {
EXPECT_LE(allb, 255);
}
// BT.601 YUV to RGB reference
// BT.601 limited range YUV to RGB reference
static void YUVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.596);
*g = RoundToByte((y - 16) * 1.164 - (u - 128) * 0.391 - (v - 128) * 0.813);
*b = RoundToByte((y - 16) * 1.164 - (u - 128) * -2.018);
}
// JPEG YUV to RGB reference
// BT.601 full range YUV to RGB reference (aka JPEG)
static void YUVJToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y - (v - 128) * -1.40200);
*g = RoundToByte(y - (u - 128) * 0.34414 - (v - 128) * 0.71414);
*b = RoundToByte(y - (u - 128) * -1.77200);
}
// BT.709 YUV to RGB reference
// BT.709 limited range YUV to RGB reference
// See also http://www.equasys.de/colorconversion.html
static void YUVHToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164 - (v - 128) * -1.793);
@ -434,7 +460,7 @@ static void YUVFToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte(y - (u - 128) * -1.8556);
}
// BT.2020 YUV to RGB reference
// BT.2020 limited range YUV to RGB reference
static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte((y - 16) * 1.164384 - (v - 128) * -1.67867);
*g = RoundToByte((y - 16) * 1.164384 - (u - 128) * 0.187326 -
@ -442,6 +468,13 @@ static void YUVUToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*b = RoundToByte((y - 16) * 1.164384 - (u - 128) * -2.14177);
}
// BT.2020 full range YUV to RGB reference
static void YUVVToRGBReference(int y, int u, int v, int* r, int* g, int* b) {
*r = RoundToByte(y + (v - 128) * 1.474600);
*g = RoundToByte(y - (u - 128) * 0.164553 - (v - 128) * 0.571353);
*b = RoundToByte(y + (u - 128) * 1.881400);
}
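The three coefficients above are not arbitrary: they fall directly out of the BT.2020 luma weights. A minimal standalone sketch (not part of the test file; kr/kb are assumed from the BT.2020 spec) that reproduces them:

#include <stdio.h>

// Hedged sketch: derive the BT.2020 full range YUV->RGB coefficients from
// the luma weights kr = 0.2627, kb = 0.0593 (assumed, not taken from the
// test itself). Expected output matches YUVVToRGBReference above.
int main() {
  const double kr = 0.2627;
  const double kb = 0.0593;
  const double kg = 1.0 - kr - kb;
  printf("VR %f\n", 2.0 * (1.0 - kr));            // 1.474600
  printf("UG %f\n", 2.0 * (1.0 - kb) * kb / kg);  // 0.164553
  printf("VG %f\n", 2.0 * (1.0 - kr) * kr / kg);  // 0.571353
  printf("UB %f\n", 2.0 * (1.0 - kb));            // 1.881400
  return 0;
}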
TEST_F(LibYUVColorTest, TestYUV) {
int r0, g0, b0, r1, g1, b1;
@ -573,16 +606,12 @@ static void PrintHistogram(int rh[256], int gh[256], int bh[256]) {
#else
#define FASTSTEP 5
#endif
// BT.601 limited range.
TEST_F(LibYUVColorTest, TestFullYUV) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -602,16 +631,11 @@ TEST_F(LibYUVColorTest, TestFullYUV) {
PrintHistogram(rh, gh, bh);
}
// BT.601 full range.
TEST_F(LibYUVColorTest, TestFullYUVJ) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -631,16 +655,11 @@ TEST_F(LibYUVColorTest, TestFullYUVJ) {
PrintHistogram(rh, gh, bh);
}
// BT.709 limited range.
TEST_F(LibYUVColorTest, TestFullYUVH) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -661,16 +680,11 @@ TEST_F(LibYUVColorTest, TestFullYUVH) {
PrintHistogram(rh, gh, bh);
}
// BT.709 full range.
TEST_F(LibYUVColorTest, TestFullYUVF) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -690,16 +704,11 @@ TEST_F(LibYUVColorTest, TestFullYUVF) {
PrintHistogram(rh, gh, bh);
}
// BT.2020 limited range.
TEST_F(LibYUVColorTest, TestFullYUVU) {
int rh[256] = {
0,
};
int gh[256] = {
0,
};
int bh[256] = {
0,
};
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
@ -719,6 +728,30 @@ TEST_F(LibYUVColorTest, TestFullYUVU) {
}
PrintHistogram(rh, gh, bh);
}
// BT.2020 full range.
TEST_F(LibYUVColorTest, TestFullYUVV) {
int rh[256] = { 0, };
int gh[256] = { 0, };
int bh[256] = { 0, };
for (int u = 0; u < 256; ++u) {
for (int v = 0; v < 256; ++v) {
for (int y2 = 0; y2 < 256; y2 += FASTSTEP) {
int r0, g0, b0, r1, g1, b1;
int y = RANDOM256(y2);
YUVVToRGBReference(y, u, v, &r0, &g0, &b0);
YUVVToRGB(y, u, v, &r1, &g1, &b1);
EXPECT_NEAR(r0, r1, ERROR_R);
EXPECT_NEAR(g0, g1, 2);
EXPECT_NEAR(b0, b1, ERROR_B);
++rh[r1 - r0 + 128];
++gh[g1 - g0 + 128];
++bh[b1 - b0 + 128];
}
}
}
PrintHistogram(rh, gh, bh);
}
#undef FASTSTEP
TEST_F(LibYUVColorTest, TestGreyYUVJ) {

View File

@ -558,7 +558,7 @@ TESTBIPLANARTOBP(NV12, 2, 2, NV12Mirror, 2, 2)
TESTBIPLANARTOP(NV12, 2, 2, I420, 2, 2)
TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
// Provide matrix wrappers
// Provide matrix wrappers for full range bt.709
#define F420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuF709Constants, i, j)
#define F420ToARGB(a, b, c, d, e, f, g, h, i, j) \
@ -572,6 +572,20 @@ TESTBIPLANARTOP(NV21, 2, 2, I420, 2, 2)
#define F444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvF709Constants, i, j)
// Provide matrix wrappers for full range bt.2020
#define V420ToABGR(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V420ToARGB(a, b, c, d, e, f, g, h, i, j) \
I420ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
#define V422ToABGR(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V422ToARGB(a, b, c, d, e, f, g, h, i, j) \
I422ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
#define V444ToABGR(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, e, f, c, d, g, h, &kYvuV2020Constants, i, j)
#define V444ToARGB(a, b, c, d, e, f, g, h, i, j) \
I444ToARGBMatrix(a, b, c, d, e, f, g, h, &kYuvV2020Constants, i, j)
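Outside these test-only wrappers, the new matrix is reached through the regular *ToARGBMatrix entry points. A minimal sketch of a caller, assuming a full range BT.2020 I420 frame; the wrapper name and buffer layout are illustrative only, not libyuv API:

#include "libyuv/convert_argb.h"

// Hedged sketch: convert a full range BT.2020 (V420) frame to ARGB by
// passing kYuvV2020Constants to the generic I420 matrix converter.
int ConvertV420ToARGB(const uint8_t* src_y, const uint8_t* src_u,
                      const uint8_t* src_v, uint8_t* dst_argb,
                      int width, int height) {
  return libyuv::I420ToARGBMatrix(src_y, width,            // Y plane + stride
                                  src_u, (width + 1) / 2,  // U plane + stride
                                  src_v, (width + 1) / 2,  // V plane + stride
                                  dst_argb, width * 4,     // ARGB + stride
                                  &libyuv::kYuvV2020Constants,
                                  width, height);
}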
#define ALIGNINT(V, ALIGN) (((V) + (ALIGN)-1) / (ALIGN) * (ALIGN))
#define TESTPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, BPP_B, ALIGN, \
@ -643,6 +657,8 @@ TESTPLANARTOB(H420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(H420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(U420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ARGB, 4, 4, 1)
TESTPLANARTOB(V420, 2, 2, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, BGRA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RGBA, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, RAW, 3, 3, 1)
@ -667,6 +683,8 @@ TESTPLANARTOB(H422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V422, 2, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, BGRA, 4, 4, 1)
TESTPLANARTOB(I422, 2, 1, RGBA, 4, 4, 1)
TESTPLANARTOB(I444, 1, 1, ARGB, 4, 4, 1)
@ -677,6 +695,8 @@ TESTPLANARTOB(H444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(H444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(U444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ARGB, 4, 4, 1)
TESTPLANARTOB(V444, 1, 1, ABGR, 4, 4, 1)
TESTPLANARTOB(I420, 2, 2, YUY2, 2, 4, 1)
TESTPLANARTOB(I420, 2, 2, UYVY, 2, 4, 1)
TESTPLANARTOB(I422, 2, 1, YUY2, 2, 4, 1)
@ -772,6 +792,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V420AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V420AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I420AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define J422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@ -796,6 +822,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V422AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V422AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I422AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define J444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvJPEGConstants, k, \
l, m)
@ -820,6 +852,12 @@ TESTPLANARTOB(H420, 2, 2, AR30, 4, 4, 1)
#define U444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuv2020Constants, k, \
l, m)
#define V444AlphaToARGB(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToARGBMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
#define V444AlphaToABGR(a, b, c, d, e, f, g, h, i, j, k, l, m) \
I444AlphaToABGRMatrix(a, b, c, d, e, f, g, h, i, j, &kYuvV2020Constants, k, \
l, m)
TESTQPLANARTOB(I420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(I420Alpha, 2, 2, ABGR, 4, 4, 1)
@ -829,6 +867,8 @@ TESTQPLANARTOB(H420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(H420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(U420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ARGB, 4, 4, 1)
TESTQPLANARTOB(V420Alpha, 2, 2, ABGR, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J422Alpha, 2, 1, ARGB, 4, 4, 1)
@ -837,6 +877,8 @@ TESTQPLANARTOB(H422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(V422Alpha, 2, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(I444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(J444Alpha, 1, 1, ARGB, 4, 4, 1)
@ -845,6 +887,8 @@ TESTQPLANARTOB(H444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(H444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(U444Alpha, 1, 1, ABGR, 4, 4, 1)
TESTQPLANARTOB(V444Alpha, 1, 1, ARGB, 4, 4, 1)
TESTQPLANARTOB(V444Alpha, 1, 1, ABGR, 4, 4, 1)
#define TESTBIPLANARTOBI(FMT_PLANAR, SUBSAMP_X, SUBSAMP_Y, FMT_B, FMT_C, \
BPP_B, W1280, N, NEG, OFF) \
@ -2771,6 +2815,8 @@ TESTPLANARTOE(H422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V422, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, BGRA, 1, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, RGBA, 1, 4, ARGB, 4)
TESTPLANARTOE(I444, 1, 1, ARGB, 1, 4, ABGR, 4)
@ -2781,6 +2827,8 @@ TESTPLANARTOE(H444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(H444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(U444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ARGB, 1, 4, ARGB, 4)
TESTPLANARTOE(V444, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, YUY2, 2, 4, ARGB, 4)
TESTPLANARTOE(I420, 2, 2, UYVY, 2, 4, ARGB, 4)
TESTPLANARTOE(I422, 2, 1, YUY2, 2, 4, ARGB, 4)
@ -2862,6 +2910,8 @@ TESTQPLANARTOE(F420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(F420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V420Alpha, 2, 2, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V420Alpha, 2, 2, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
@ -2872,6 +2922,8 @@ TESTQPLANARTOE(H422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V422Alpha, 2, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V422Alpha, 2, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(I444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(J444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
@ -2880,6 +2932,8 @@ TESTQPLANARTOE(H444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(H444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(U444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
TESTQPLANARTOE(V444Alpha, 1, 1, ARGB, 1, 4, ABGR, 4)
TESTQPLANARTOE(V444Alpha, 1, 1, ABGR, 1, 4, ARGB, 4)
#define TESTPLANETOEI(FMT_A, SUB_A, BPP_A, FMT_B, SUB_B, BPP_B, W1280, N, NEG, \
OFF, FMT_C, BPP_C) \

util/color.cc (new file, 118 lines)
View File

@ -0,0 +1,118 @@
/*
* Copyright 2021 The LibYuv Project Authors. All rights reserved.
*
* Use of this source code is governed by a BSD-style license
* that can be found in the LICENSE file in the root of the source
* tree. An additional intellectual property rights grant can be found
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// This utility computes values needed to generate yuvconstants based on
// white point values.
// The yuv formulas are tuned for 8 bit YUV channels.
// For those MCs that can be represented as kr and kb:
// Full range
// float M[3][3] {{1,0,2*(1-kr)},{1,-((2*kb)/((2-kb)*(1-kb-kr))),-((2*kr)/((2-kr)*(1-kb-kr)))},{1,2*(1-kb),0}};
// float B[3] {1+(256*(1-kr))/255,1-(256*kb)/(255*(2-kb)*(1-kb-kr))-(256*kr)/(255*(2-kr)*(1-kb-kr)),1+(256*(1-kb))/255};
// Limited range
// float M[3][3] {{85/73,0,255/112-(255*kr)/112},{85/73,-((255*kb)/(112*(2-kb)*(1-kb-kr))),-((255*kr)/(112*(2-kr)*(1-kb-kr)))},{85/73,255/112-(255*kb)/112,0}};
// float B[3] {77662/43435-(1537*kr)/1785,203/219-(1537*kb)/(1785*(2-kb)*(1-kb-kr))-(1537*kr)/(1785*(2-kr)*(1-kb-kr)),77662/43435-(1537*kb)/1785};
// mc bt
// 1 bt.709 KR = 0.2126; KB = 0.0722
// 4 fcc KR = 0.30; KB = 0.11
// 6 bt.601 KR = 0.299; KB = 0.114
// 7 SMPTE 240M KR = 0.212; KB = 0.087
// 10 bt2020 KR = 0.2627; KB = 0.0593
// BT.709 full range YUV to RGB reference
// R = Y + V * 1.5748
// G = Y - U * 0.18732 - V * 0.46812
// B = Y + U * 1.8556
// KR = 0.2126
// KB = 0.0722
// https://mymusing.co/bt601-yuv-to-rgb-conversion-color/
// // Y contribution to R,G,B. Scale and bias.
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
// #define UB 113 /* round(1.77200 * 64) */
// #define UG 22 /* round(0.34414 * 64) */
// #define VG 46 /* round(0.71414 * 64) */
// #define VR 90 /* round(1.40200 * 64) */
//
// // Bias values to round, and subtract 128 from U and V.
// #define BB (-UB * 128 + YB)
// #define BG (UG * 128 + VG * 128 + YB)
// #define BR (-VR * 128 + YB)
int round(float v) {
return (int) (v + 0.5);
}
int main(int argc, const char* argv[]) {
  if (argc < 3) {
printf("color kr kb\n");
return -1;
}
float kr = atof(argv[1]);
float kb = atof(argv[2]);
float kg = 1 - kr - kb;
float vr = 2 * (1 - kr);
float ug = 2 * ((1 - kb) * kb / kg);
float vg = 2 * ((1 - kr) * kr / kg);
float ub = 2 * (1 - kb);
printf("Full range\n");
printf("R = Y + V * %5f\n", vr);
printf("G = Y - U * %6f - V * %6f\n", ug, vg);
printf("B = Y + U * %5f\n", ub);
printf("KR = %4f; ", kr);
printf("KB = %4f\n", kb);
// printf("KG = %4f\n", kg);
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
vr = 255.f / 224.f * 2 * (1 - kr);
ug = 255.f / 224.f * 2 * ((1 - kb) * kb / kg);
vg = 255.f / 224.f * 2 * ((1 - kr) * kr / kg);
ub = 255.f / 224.f * 2 * (1 - kb);
printf("Limited range\n");
printf("R = (Y - 16) * 1.164 + V * %5f\n", vr);
printf("G = (Y - 16) * 1.164 - U * %6f - V * %6f\n", ug, vg);
printf("B = (Y - 16) * 1.164 + U * %5f\n", ub);
// printf("KG = %4f\n", kg);
// #define YG 16320 /* round(1.000 * 64 * 256 * 256 / 257) */
// #define YB 32 /* 64 / 2 */
//
// // U and V contributions to R,G,B.
printf("UB %-3d /* round(%f * 64) */\n", round(ub * 64), ub);
printf("UG %-3d /* round(%f * 64) */\n", round(ug * 64), ug);
printf("VG %-3d /* round(%f * 64) */\n", round(vg * 64), vg);
printf("VR %-3d /* round(%f * 64) */\n", round(vr * 64), vr);
return 0;
}
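Usage note for the tool: invoked with the BT.2020 weights from the mc table above (the binary name is assumed from the usage string), e.g.

  color 0.2627 0.0593

it prints the full range coefficients (VR 1.4746, UG 0.164553, VG 0.571353, UB 1.8814, i.e. the values used in YUVVToRGBReference earlier in this change), their rounded 6 bit fixed point forms (UB 120, UG 11, VG 37, VR 94), and the corresponding limited range set scaled by 255/224, which feed the new kYuvV2020Constants entries.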