Fix RotatePlane by 90 on Neon when source width is not a multiple of 8

Bug: b/220888716, b/218875554, b/220205245
Change-Id: I17e118ac9b9a7013386a5f0ad27a2dd249474ae5
Reviewed-on: https://chromium-review.googlesource.com/c/libyuv/libyuv/+/3483576
Reviewed-by: Mirko Bonadei <mbonadei@chromium.org>
Commit-Queue: Frank Barchard <fbarchard@chromium.org>
This commit is contained in:
Frank Barchard 2022-02-23 00:46:55 -08:00 committed by libyuv LUCI CQ
parent 3b8c86d23a
commit e77531f6f1
9 changed files with 128 additions and 136 deletions

View File

@@ -5,4 +5,4 @@ License: BSD
License File: LICENSE License File: LICENSE
Description: Description:
libyuv is an open source project that includes YUV conversion and scaling functionality. libyuv is an open source project that includes YUV conversion and scaling functionality.

View File

@@ -1864,7 +1864,7 @@ int I422ToRGBAMatrix(const uint8_t* src_y,
int width, int width,
int height); int height);
// Convert I422 to RGBA with matrix. // Convert I420 to RGBA with matrix.
LIBYUV_API LIBYUV_API
int I420ToRGBAMatrix(const uint8_t* src_y, int I420ToRGBAMatrix(const uint8_t* src_y,
int src_stride_y, int src_stride_y,

View File

@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_ #ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_ #define INCLUDE_LIBYUV_VERSION_H_
#define LIBYUV_VERSION 1810 #define LIBYUV_VERSION 1811
#endif // INCLUDE_LIBYUV_VERSION_H_ #endif // INCLUDE_LIBYUV_VERSION_H_

View File

@@ -29,10 +29,7 @@ void TransposePlane(const uint8_t* src,
int width, int width,
int height) { int height) {
int i = height; int i = height;
#if defined(HAS_TRANSPOSEWX16_MSA) #if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C;
#elif defined(HAS_TRANSPOSEWX16_LSX)
void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst, void (*TransposeWx16)(const uint8_t* src, int src_stride, uint8_t* dst,
int dst_stride, int width) = TransposeWx16_C; int dst_stride, int width) = TransposeWx16_C;
#else #else
@@ -40,24 +37,12 @@ void TransposePlane(const uint8_t* src,
int dst_stride, int width) = TransposeWx8_C; int dst_stride, int width) = TransposeWx8_C;
#endif #endif
#if defined(HAS_TRANSPOSEWX16_MSA)
if (TestCpuFlag(kCpuHasMSA)) {
TransposeWx16 = TransposeWx16_Any_MSA;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_MSA;
}
}
#elif defined(HAS_TRANSPOSEWX16_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
TransposeWx16 = TransposeWx16_Any_LSX;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_LSX;
}
}
#else
#if defined(HAS_TRANSPOSEWX8_NEON) #if defined(HAS_TRANSPOSEWX8_NEON)
if (TestCpuFlag(kCpuHasNEON)) { if (TestCpuFlag(kCpuHasNEON)) {
TransposeWx8 = TransposeWx8_NEON; TransposeWx8 = TransposeWx8_Any_NEON;
if (IS_ALIGNED(width, 8)) {
TransposeWx8 = TransposeWx8_NEON;
}
} }
#endif #endif
#if defined(HAS_TRANSPOSEWX8_SSSE3) #if defined(HAS_TRANSPOSEWX8_SSSE3)
@@ -76,17 +61,24 @@ void TransposePlane(const uint8_t* src,
} }
} }
#endif #endif
#endif /* defined(HAS_TRANSPOSEWX16_MSA) */
#if defined(HAS_TRANSPOSEWX16_MSA) #if defined(HAS_TRANSPOSEWX16_MSA)
// Work across the source in 16x16 tiles if (TestCpuFlag(kCpuHasMSA)) {
while (i >= 16) { TransposeWx16 = TransposeWx16_Any_MSA;
TransposeWx16(src, src_stride, dst, dst_stride, width); if (IS_ALIGNED(width, 16)) {
src += 16 * src_stride; // Go down 16 rows. TransposeWx16 = TransposeWx16_MSA;
dst += 16; // Move over 16 columns. }
i -= 16;
} }
#elif defined(HAS_TRANSPOSEWX16_LSX) #endif
#if defined(HAS_TRANSPOSEWX16_LSX)
if (TestCpuFlag(kCpuHasLSX)) {
TransposeWx16 = TransposeWx16_Any_LSX;
if (IS_ALIGNED(width, 16)) {
TransposeWx16 = TransposeWx16_LSX;
}
}
#endif
#if defined(HAS_TRANSPOSEWX16_MSA) || defined(HAS_TRANSPOSEWX16_LSX)
// Work across the source in 16x16 tiles // Work across the source in 16x16 tiles
while (i >= 16) { while (i >= 16) {
TransposeWx16(src, src_stride, dst, dst_stride, width); TransposeWx16(src, src_stride, dst, dst_stride, width);

View File

@@ -24,14 +24,14 @@ extern "C" {
#define ALPHA_VAL (-1) #define ALPHA_VAL (-1)
// Fill YUV -> RGB conversion constants into vectors // Fill YUV -> RGB conversion constants into vectors
#define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \ #define YUVTORGB_SETUP(yuvconst, ub, vr, ug, vg, yg, yb) \
{ \ { \
ub = __msa_fill_w(yuvconst->kUVToB[0]); \ ub = __msa_fill_w(yuvconst->kUVToB[0]); \
vr = __msa_fill_w(yuvconst->kUVToR[1]); \ vr = __msa_fill_w(yuvconst->kUVToR[1]); \
ug = __msa_fill_w(yuvconst->kUVToG[0]); \ ug = __msa_fill_w(yuvconst->kUVToG[0]); \
vg = __msa_fill_w(yuvconst->kUVToG[1]); \ vg = __msa_fill_w(yuvconst->kUVToG[1]); \
yg = __msa_fill_w(yuvconst->kYToRgb[0]); \ yg = __msa_fill_w(yuvconst->kYToRgb[0]); \
yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \ yb = __msa_fill_w(yuvconst->kYBiasToRgb[0]); \
} }
// Load YUV 422 pixel data // Load YUV 422 pixel data
@@ -68,50 +68,50 @@ extern "C" {
} }
// Convert 8 pixels of YUV 420 to RGB. // Convert 8 pixels of YUV 420 to RGB.
#define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \ #define YUVTORGB(in_y, in_uv, ubvr, ugvg, yg, yb, out_b, out_g, out_r) \
{ \ { \
v8i16 vec0_m, vec1_m; \ v8i16 vec0_m, vec1_m; \
v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \ v4i32 reg0_m, reg1_m, reg2_m, reg3_m, reg4_m; \
v4i32 reg5_m, reg6_m, reg7_m; \ v4i32 reg5_m, reg6_m, reg7_m; \
v16i8 zero_m = {0}; \ v16i8 zero_m = {0}; \
\ \
vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \ vec0_m = (v8i16)__msa_ilvr_b((v16i8)in_y, (v16i8)in_y); \
vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \ vec1_m = (v8i16)__msa_ilvr_b((v16i8)zero_m, (v16i8)in_uv); \
reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \ reg0_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec0_m); \
reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \ reg1_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec0_m); \
reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \ reg2_m = (v4i32)__msa_ilvr_h((v8i16)zero_m, (v8i16)vec1_m); \
reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \ reg3_m = (v4i32)__msa_ilvl_h((v8i16)zero_m, (v8i16)vec1_m); \
reg0_m *= yg; \ reg0_m *= yg; \
reg1_m *= yg; \ reg1_m *= yg; \
reg2_m *= ubvr; \ reg2_m *= ubvr; \
reg3_m *= ubvr; \ reg3_m *= ubvr; \
reg0_m = __msa_srai_w(reg0_m, 16); \ reg0_m = __msa_srai_w(reg0_m, 16); \
reg1_m = __msa_srai_w(reg1_m, 16); \ reg1_m = __msa_srai_w(reg1_m, 16); \
reg0_m += yb; \ reg0_m += yb; \
reg1_m += yb; \ reg1_m += yb; \
reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \ reg4_m = __msa_dotp_s_w((v8i16)vec1_m, (v8i16)ugvg); \
reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \ reg5_m = __msa_ilvev_w(reg2_m, reg2_m); \
reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \ reg6_m = __msa_ilvev_w(reg3_m, reg3_m); \
reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \ reg7_m = __msa_ilvr_w(reg4_m, reg4_m); \
reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \ reg2_m = __msa_ilvod_w(reg2_m, reg2_m); \
reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \ reg3_m = __msa_ilvod_w(reg3_m, reg3_m); \
reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \ reg4_m = __msa_ilvl_w(reg4_m, reg4_m); \
reg5_m = reg0_m - reg5_m; \ reg5_m = reg0_m - reg5_m; \
reg6_m = reg1_m - reg6_m; \ reg6_m = reg1_m - reg6_m; \
reg2_m = reg0_m - reg2_m; \ reg2_m = reg0_m - reg2_m; \
reg3_m = reg1_m - reg3_m; \ reg3_m = reg1_m - reg3_m; \
reg7_m = reg0_m - reg7_m; \ reg7_m = reg0_m - reg7_m; \
reg4_m = reg1_m - reg4_m; \ reg4_m = reg1_m - reg4_m; \
reg5_m = __msa_srai_w(reg5_m, 6); \ reg5_m = __msa_srai_w(reg5_m, 6); \
reg6_m = __msa_srai_w(reg6_m, 6); \ reg6_m = __msa_srai_w(reg6_m, 6); \
reg7_m = __msa_srai_w(reg7_m, 6); \ reg7_m = __msa_srai_w(reg7_m, 6); \
reg4_m = __msa_srai_w(reg4_m, 6); \ reg4_m = __msa_srai_w(reg4_m, 6); \
reg2_m = __msa_srai_w(reg2_m, 6); \ reg2_m = __msa_srai_w(reg2_m, 6); \
reg3_m = __msa_srai_w(reg3_m, 6); \ reg3_m = __msa_srai_w(reg3_m, 6); \
CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \ CLIP_0TO255(reg5_m, reg6_m, reg7_m, reg4_m, reg2_m, reg3_m); \
out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \ out_b = __msa_pckev_h((v8i16)reg6_m, (v8i16)reg5_m); \
out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \ out_g = __msa_pckev_h((v8i16)reg4_m, (v8i16)reg7_m); \
out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \ out_r = __msa_pckev_h((v8i16)reg3_m, (v8i16)reg2_m); \
} }
// Pack and Store 8 ARGB values. // Pack and Store 8 ARGB values.
@@ -278,32 +278,32 @@ extern "C" {
out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \ out_v = (v16u8)__msa_insert_d(zero_m, 0, (int64_t)v_m); \
} }
#define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \ #define RGBTOUV(_tmpb, _tmpg, _tmpr, _nexb, _nexg, _nexr, _dst0) \
{ \ { \
v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \ v16u8 _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5; \
v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \ v8i16 _reg0, _reg1, _reg2, _reg3, _reg4, _reg5; \
_tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \ _tmp0 = (v16u8)__msa_ilvev_b(_tmpb, _nexb); \
_tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \ _tmp1 = (v16u8)__msa_ilvod_b(_tmpb, _nexb); \
_tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \ _tmp2 = (v16u8)__msa_ilvev_b(_tmpg, _nexg); \
_tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \ _tmp3 = (v16u8)__msa_ilvod_b(_tmpg, _nexg); \
_tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \ _tmp4 = (v16u8)__msa_ilvev_b(_tmpr, _nexr); \
_tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \ _tmp5 = (v16u8)__msa_ilvod_b(_tmpr, _nexr); \
_reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \ _reg0 = (v8i16)__msa_hadd_u_h(_tmp0, _tmp0); \
_reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \ _reg1 = (v8i16)__msa_hadd_u_h(_tmp1, _tmp1); \
_reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \ _reg2 = (v8i16)__msa_hadd_u_h(_tmp2, _tmp2); \
_reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \ _reg3 = (v8i16)__msa_hadd_u_h(_tmp3, _tmp3); \
_reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \ _reg4 = (v8i16)__msa_hadd_u_h(_tmp4, _tmp4); \
_reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \ _reg5 = (v8i16)__msa_hadd_u_h(_tmp5, _tmp5); \
_reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \ _reg0 = (v8i16)__msa_aver_u_h(_reg0, _reg1); \
_reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \ _reg2 = (v8i16)__msa_aver_u_h(_reg2, _reg3); \
_reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \ _reg4 = (v8i16)__msa_aver_u_h(_reg4, _reg5); \
_reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \ _reg1 = (v8i16)__msa_maddv_h(const_112, _reg0, const_8080); \
_reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \ _reg3 = (v8i16)__msa_maddv_h(const_112, _reg4, const_8080); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_74, _reg2); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_94, _reg2); \
_reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \ _reg1 = (v8i16)__msa_msubv_h(_reg1, const_38, _reg4); \
_reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \ _reg3 = (v8i16)__msa_msubv_h(_reg3, const_18, _reg0); \
_dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \ _dst0 = (v16u8)__msa_pckod_b(_reg3, _reg1); \
} }
void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) { void MirrorRow_MSA(const uint8_t* src, uint8_t* dst, int width) {
@@ -1687,9 +1687,9 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
v16u8 reg0, reg1, reg2, dst; v16u8 reg0, reg1, reg2, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1; v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66); v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129); v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25); v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080); v8u16 const_1080 = (v8u16)__msa_fill_h(0x1080);
v16u8 zero = (v16u8)__msa_ldi_b(0); v16u8 zero = (v16u8)__msa_ldi_b(0);
@@ -1726,7 +1726,7 @@ void ARGB1555ToYRow_MSA(const uint8_t* src_argb1555,
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1); res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0); res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1); res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0); dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y); ST_UB(dst, dst_y);
src_argb1555 += 32; src_argb1555 += 32;
dst_y += 16; dst_y += 16;
@@ -1739,9 +1739,9 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
v16u8 reg0, reg1, dst; v16u8 reg0, reg1, dst;
v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r; v8i16 tmpr_l, tmpr_r, tmpg_l, tmpg_r, tmpb_l, tmpb_r;
v8i16 res0, res1; v8i16 res0, res1;
v8i16 const_66 = (v8i16)__msa_ldi_h(66); v8i16 const_66 = (v8i16)__msa_ldi_h(66);
v8i16 const_129 = (v8i16)__msa_ldi_h(129); v8i16 const_129 = (v8i16)__msa_ldi_h(129);
v8i16 const_25 = (v8i16)__msa_ldi_h(25); v8i16 const_25 = (v8i16)__msa_ldi_h(25);
v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080); v8i16 const_1080 = (v8i16)__msa_fill_h(0x1080);
v16u8 zero = __msa_ldi_b(0); v16u8 zero = __msa_ldi_b(0);
@@ -1776,7 +1776,7 @@ void RGB565ToYRow_MSA(const uint8_t* src_rgb565, uint8_t* dst_y, int width) {
res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1); res1 = (v8i16)__msa_maddv_h(tmpg_l, const_129, res1);
res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0); res0 = (v8i16)__msa_maddv_h(tmpr_r, const_66, res0);
res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1); res1 = (v8i16)__msa_maddv_h(tmpr_l, const_66, res1);
dst = (v16u8)__msa_pckod_b(res1, res0); dst = (v16u8)__msa_pckod_b(res1, res0);
ST_UB(dst, dst_y); ST_UB(dst, dst_y);
src_rgb565 += 32; src_rgb565 += 32;
dst_y += 16; dst_y += 16;
@@ -1879,10 +1879,10 @@ void ARGB1555ToUVRow_MSA(const uint8_t* src_argb1555,
v16u8 reg0, reg1, reg2, reg3; v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) { for (x = 0; x < width; x += 16) {
@@ -1952,10 +1952,10 @@ void RGB565ToUVRow_MSA(const uint8_t* src_rgb565,
v16u8 reg0, reg1, reg2, reg3; v16u8 reg0, reg1, reg2, reg3;
v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr; v16u8 tmpb, tmpg, tmpr, nexb, nexg, nexr;
v8i16 const_112 = (v8i16)__msa_ldi_h(0x38); v8i16 const_112 = (v8i16)__msa_ldi_h(0x38);
v8i16 const_74 = (v8i16)__msa_ldi_h(0x25); v8i16 const_74 = (v8i16)__msa_ldi_h(0x25);
v8i16 const_38 = (v8i16)__msa_ldi_h(0x13); v8i16 const_38 = (v8i16)__msa_ldi_h(0x13);
v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F); v8i16 const_94 = (v8i16)__msa_ldi_h(0x2F);
v8i16 const_18 = (v8i16)__msa_ldi_h(0x09); v8i16 const_18 = (v8i16)__msa_ldi_h(0x09);
v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080); v8u16 const_8080 = (v8u16)__msa_fill_h(0x8080);
for (x = 0; x < width; x += 16) { for (x = 0; x < width; x += 16) {

View File

@@ -585,11 +585,11 @@ void DetileRow_NEON(const uint8_t* src,
int width) { int width) {
asm volatile( asm volatile(
"1: \n" "1: \n"
"vld1.16 {q0}, [%0], %3 \n" // load 16 bytes "vld1.16 {q0}, [%0], %3 \n" // load 16 bytes
"subs %2, %2, #16 \n" // 16 processed per loop "subs %2, %2, #16 \n" // 16 processed per loop
"pld [%0, 1792] \n" "pld [%0, 1792] \n"
"vst1.16 {q0}, [%1]! \n" // store 16 bytes "vst1.16 {q0}, [%1]! \n" // store 16 bytes
"bgt 1b \n" "bgt 1b \n"
: "+r"(src), // %0 : "+r"(src), // %0
"+r"(dst), // %1 "+r"(dst), // %1
"+r"(width) // %2 "+r"(width) // %2
@@ -608,7 +608,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
"1: \n" "1: \n"
"vld2.8 {d0, d1}, [%0], %4 \n" "vld2.8 {d0, d1}, [%0], %4 \n"
"subs %3, %3, #16 \n" "subs %3, %3, #16 \n"
"pld [%0, 1792] \n" "pld [%0, 1792] \n"
"vst1.8 {d0}, [%1]! \n" "vst1.8 {d0}, [%1]! \n"
"vst1.8 {d1}, [%2]! \n" "vst1.8 {d1}, [%2]! \n"
"bgt 1b \n" "bgt 1b \n"

View File

@@ -637,7 +637,7 @@ void DetileSplitUVRow_NEON(const uint8_t* src_uv,
"1: \n" "1: \n"
"ld2 {v0.8b,v1.8b}, [%0], %4 \n" "ld2 {v0.8b,v1.8b}, [%0], %4 \n"
"subs %w3, %w3, #16 \n" "subs %w3, %w3, #16 \n"
"prfm pldl1keep, [%0, 1792] \n" "prfm pldl1keep, [%0, 1792] \n"
"st1 {v0.8b}, [%1], #8 \n" "st1 {v0.8b}, [%1], #8 \n"
"st1 {v1.8b}, [%2], #8 \n" "st1 {v1.8b}, [%2], #8 \n"
"b.gt 1b \n" "b.gt 1b \n"

View File

@@ -433,7 +433,7 @@ TESTPLANARTOBP(I212, uint16_t, 2, 2, 1, P212, uint16_t, 2, 2, 1, 12)
"DST_SUBSAMP_Y unsupported"); \ "DST_SUBSAMP_Y unsupported"); \
const int kWidth = W1280; \ const int kWidth = W1280; \
const int kHeight = benchmark_height_; \ const int kHeight = benchmark_height_; \
const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \ const int kSrcHalfWidth = SUBSAMPLE(kWidth, SRC_SUBSAMP_X); \
const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \ const int kDstHalfWidth = SUBSAMPLE(kWidth, DST_SUBSAMP_X); \
const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \ const int kDstHalfHeight = SUBSAMPLE(kHeight, DST_SUBSAMP_Y); \
const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \ const int kPaddedWidth = (kWidth + (TILE_WIDTH - 1)) & ~(TILE_WIDTH - 1); \

View File

@@ -1503,15 +1503,15 @@ TEST_F(LibYUVPlanarTest, TestDetilePlane) {
// Disable all optimizations. // Disable all optimizations.
MaskCpuFlags(disable_cpu_flags_); MaskCpuFlags(disable_cpu_flags_);
for (j = 0; j < benchmark_iterations_; j++) { for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, DetilePlane(orig_y, orig_width, dst_c, benchmark_width_, benchmark_width_,
benchmark_width_, benchmark_height_, 16); benchmark_height_, 16);
} }
// Enable optimizations. // Enable optimizations.
MaskCpuFlags(benchmark_cpu_info_); MaskCpuFlags(benchmark_cpu_info_);
for (j = 0; j < benchmark_iterations_; j++) { for (j = 0; j < benchmark_iterations_; j++) {
DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, DetilePlane(orig_y, orig_width, dst_opt, benchmark_width_, benchmark_width_,
benchmark_width_, benchmark_height_, 16); benchmark_height_, 16);
} }
for (i = 0; i < y_plane_size; ++i) { for (i = 0; i < y_plane_size; ++i) {