Fix RotatePlane by 90 on Neon when source width is not a multiple of 8

Bug: b/220888716, b/218875554, b/220205245
Change-Id: I17e118ac9b9a7013386a5f0ad27a2dd249474ae5
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3483576
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-02-23 00:46:55 -08:00 committed by libyuv LUCI CQ
parent 3b8c86d23a
commit e77531f6f1
9 changed files with 128 additions and 136 deletions

View File

@@ -1864,7 +1864,7 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
int width,
int height);
// Convert I422 to RGBA with matrix.
// Convert I420 to RGBA with matrix.
LIBYUV_API
int I420ToRGBAMatrix(const uint8_t* src_y,
int src_stride_y,

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1810
#define LIBYUV_VERSION 1811
#endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -29,10 +29,7 @@ void TransposePlane(const uint8_t* src,
int width,
int height) {
int i = height;
#if defined(HAS_TRANSPOSEWX16_MSA)
void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C;
#elif defined(HAS_TRANSPOSEWX16_LSX)
#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C;
#else
@@ -40,24 +37,12 @@ void TransposePlane(const uint8_t* src,
int dst_stride, int width) = TransposeWx8_C;
#endif
#if defined(HAS_TRANSPOSEWX16_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
TransposeWx16 = TransposeWx16_Any_MSA;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_MSA;
}
}
#elif defined(HAS_TRANSPOSEWX16_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
TransposeWx16 = TransposeWx16_Any_LSX;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_LSX;
}
}
#else
#if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON;
TransposeWx8 = TransposeWx8_Any_NEON;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_NEON;
}
}
#endif
#if defined(HAS_TRANSPOSEWX8_SSSE3)
@@ -76,17 +61,24 @@ void TransposePlane(const uint8_t* src,
}
}
#endif
#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
#if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles
while (i >= 16) {
TransposeWx16(src, src_stride, dst, dst_stride, width);
src += 16 * src_stride; // Go down 16 rows.
dst += 16; // Move over 16 columns.
i -= 16;
if (TestCpuFlag(kCpuHasMSA)) {
TransposeWx16 = TransposeWx16_Any_MSA;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_MSA;
}
}
#elif defined(HAS_TRANSPOSEWX16_LSX)
#endif
#if defined(HAS_TRANSPOSEWX16_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
TransposeWx16 = TransposeWx16_Any_LSX;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_LSX;
}
}
#endif
#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
// Work across the source in 16x16 tiles
while (i >= 16) {
TransposeWx16(src, src_stride, dst, dst_stride, width);

View File

@@ -24,14 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
{ \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
{ \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
}
// Load YUV 422 pixel data
@@ -68,50 +68,50 @@ extern "C" {
}
// Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
{ \
v8i16 vec0_m, vec1_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m; \
v16i8 zero_m = {0}; \
\
vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
reg0_m *= yg; \
reg1_m *= yg; \
reg2_m *= ubvr; \
reg3_m *= ubvr; \
reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \
reg0_m += yb; \
reg1_m += yb; \
reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
reg5_m = reg0_m - reg5_m; \
reg6_m = reg1_m - reg6_m; \
reg2_m = reg0_m - reg2_m; \
reg3_m = reg1_m - reg3_m; \
reg7_m = reg0_m - reg7_m; \
reg4_m = reg1_m - reg4_m; \
reg5_m = __msa_srai_w(reg5_m, 6); \
reg6_m = __msa_srai_w(reg6_m, 6); \
reg7_m = __msa_srai_w(reg7_m, 6); \
reg4_m = __msa_srai_w(reg4_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \
CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
{ \
v8i16 vec0_m, vec1_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m; \
v16i8 zero_m = {0}; \
\
vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
reg0_m *= yg; \
reg1_m *= yg; \
reg2_m *= ubvr; \
reg3_m *= ubvr; \
reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \
reg0_m += yb; \
reg1_m += yb; \
reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
reg5_m = reg0_m - reg5_m; \
reg6_m = reg1_m - reg6_m; \
reg2_m = reg0_m - reg2_m; \
reg3_m = reg1_m - reg3_m; \
reg7_m = reg0_m - reg7_m; \
reg4_m = reg1_m - reg4_m; \
reg5_m = __msa_srai_w(reg5_m, 6); \
reg6_m = __msa_srai_w(reg6_m, 6); \
reg7_m = __msa_srai_w(reg7_m, 6); \
reg4_m = __msa_srai_w(reg4_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \
CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
}
// Pack and Store 8 ARGB values.
@@ -278,32 +278,32 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
}
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
{ \
v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
_tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
_tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
_tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
_tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
_tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
_tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
_reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
_reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
_reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
_reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
_reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
_reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
_reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
_reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
_reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
_reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \
_reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
_dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
{ \
v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
_tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
_tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
_tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
_tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
_tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
_tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
_reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
_reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
_reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
_reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
_reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
_reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
_reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
_reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
_reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
_reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \
_reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
_dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
}
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
@@ -1687,9 +1687,9 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
v16u8 reg0, reg1, reg2, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080);
v16u8 zero = (v16u8)__msa_ldi_b(0);
@@ -1726,7 +1726,7 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0);
dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y);
src_argb1555 += 32;
dst_y += 16;
@@ -1739,9 +1739,9 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
v16u8 reg0, reg1, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
v16u8 zero = __msa_ldi_b(0);
@@ -1776,7 +1776,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0);
dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y);
src_rgb565 += 32;
dst_y += 16;
@@ -1879,10 +1879,10 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {
@@ -1952,10 +1952,10 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) {

View File

@@ -585,11 +585,11 @@ void DetileRow_NEON(const uint8_t* src,
int width) {
asm volatile(
"1: \n"
"vld1.16 {q0}, [%0], %3 \n" // load 16 bytes
"subs %2, %2, #16 \n" // 16 processed per loop
"pld [%0, 1792] \n"
"vst1.16 {q0}, [%1]! \n" // store 16 bytes
"bgt 1b \n"
"vld1.16 {q0}, [%0], %3 \n" // load 16 bytes
"subs %2, %2, #16 \n" // 16 processed per loop
"pld [%0, 1792] \n"
"vst1.16 {q0}, [%1]! \n" // store 16 bytes
"bgt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
"+r"(width) // %2
@@ -608,7 +608,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
"1: \n"
"vld2.8 {d0, d1}, [%0], %4 \n"
"subs %3, %3, #16 \n"
"pld [%0, 1792] \n"
"pld [%0, 1792] \n"
"vst1.8 {d0}, [%1]! \n"
"vst1.8 {d1}, [%2]! \n"
"bgt 1b \n"

View File

@@ -637,7 +637,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
"1: \n"
"ld2 {v0.8b,v1.8b}, [%0], %4 \n"
"subs %w3, %w3, #16 \n"
"prfm pldl1keep, [%0, 1792] \n"
"prfm pldl1keep, [%0, 1792] \n"
"st1 {v0.8b}, [%1], #8 \n"
"st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n"

View File

@@ -433,7 +433,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
"DST_SUBSAMP_Y unsupported"); \
const int kWidth = W1280; \
const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \

View File

@@ -1503,15 +1503,15 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
// Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_c, benchmark_width_,
benchmark_width_, benchmark_height_, 16);
DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_,
benchmark_height_, 16);
}
// Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_);
for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_,
benchmark_width_, benchmark_height_, 16);
DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_,
benchmark_height_, 16);
}
for (i = 0; i < y_plane_size; ++i) {