diff --git a/source/convert.cc b/source/convert.cc
index dbe785d36..7ebdad27b 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -3418,19 +3418,23 @@ int RAWToJ420(const uint8_t* src_raw,
     }
   }
 #endif
-#if defined(HAS_RAWTOYJROW_LSX)
+#if defined(HAS_RAWTOYJROW_LSX) && defined(HAS_RAWTOUVJROW_LSX)
   if (TestCpuFlag(kCpuHasLSX)) {
+    RAWToUVJRow = RAWToUVJRow_Any_LSX;
     RAWToYJRow = RAWToYJRow_Any_LSX;
     if (IS_ALIGNED(width, 16)) {
       RAWToYJRow = RAWToYJRow_LSX;
+      RAWToUVJRow = RAWToUVJRow_LSX;
     }
   }
 #endif
-#if defined(HAS_RAWTOYJROW_LASX)
+#if defined(HAS_RAWTOYJROW_LASX) && defined(HAS_RAWTOUVJROW_LASX)
   if (TestCpuFlag(kCpuHasLASX)) {
+    RAWToUVJRow = RAWToUVJRow_Any_LASX;
     RAWToYJRow = RAWToYJRow_Any_LASX;
     if (IS_ALIGNED(width, 32)) {
       RAWToYJRow = RAWToYJRow_LASX;
+      RAWToUVJRow = RAWToUVJRow_LASX;
     }
   }
 #endif
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index cc2238cc5..dfd803e99 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -9396,9 +9396,8 @@ void HalfMergeUVRow_SSSE3(const uint8_t* src_u,
                           uint8_t* dst_uv,
                           int width) {
   asm volatile(
-      "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psrlw $0xf,%%xmm4 \n"
-      "packuswb %%xmm4,%%xmm4 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x0101
+      "pabsb %%xmm4,%%xmm4 \n"
       "pxor %%xmm5,%%xmm5 \n"
 
       LABELALIGN
@@ -9443,8 +9442,7 @@ void HalfMergeUVRow_AVX2(const uint8_t* src_u,
                          int width) {
   asm volatile(
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
-      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
-      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpabsb %%ymm4,%%ymm4 \n"
       "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
       LABELALIGN
diff --git a/source/scale_gcc.cc b/source/scale_gcc.cc
index 84e765091..fdd38dfe5 100644
--- a/source/scale_gcc.cc
+++ b/source/scale_gcc.cc
@@ -126,9 +126,9 @@ void ScaleRowDown2Linear_SSSE3(const uint8_t* src_ptr,
                                int dst_width) {
   (void)src_stride;
   asm volatile(
-      "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psrlw $0xf,%%xmm4 \n"
-      "packuswb %%xmm4,%%xmm4 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x0101
+      "pabsb %%xmm4,%%xmm4 \n"
+
       "pxor %%xmm5,%%xmm5 \n"
 
       LABELALIGN
@@ -157,9 +157,8 @@ void ScaleRowDown2Box_SSSE3(const uint8_t* src_ptr,
                             uint8_t* dst_ptr,
                             int dst_width) {
   asm volatile(
-      "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psrlw $0xf,%%xmm4 \n"
-      "packuswb %%xmm4,%%xmm4 \n"
+      "pcmpeqb %%xmm4,%%xmm4 \n"  // 0x0101
+      "pabsb %%xmm4,%%xmm4 \n"
       "pxor %%xmm5,%%xmm5 \n"
 
       LABELALIGN
@@ -225,8 +224,7 @@ void ScaleRowDown2Linear_AVX2(const uint8_t* src_ptr,
   (void)src_stride;
   asm volatile(
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
-      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
-      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpabsb %%ymm4,%%ymm4 \n"
       "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
       LABELALIGN
@@ -258,8 +256,7 @@ void ScaleRowDown2Box_AVX2(const uint8_t* src_ptr,
                           int dst_width) {
   asm volatile(
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
-      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
-      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpabsb %%ymm4,%%ymm4 \n"
       "vpxor %%ymm5,%%ymm5,%%ymm5 \n"
 
       LABELALIGN
@@ -332,10 +329,9 @@ void ScaleRowDown4Box_SSSE3(const uint8_t* src_ptr,
   intptr_t stridex3;
   asm volatile(
       "pcmpeqb %%xmm4,%%xmm4 \n"
-      "psrlw $0xf,%%xmm4 \n"
-      "movdqa %%xmm4,%%xmm5 \n"
-      "packuswb %%xmm4,%%xmm4 \n"
-      "psllw $0x3,%%xmm5 \n"
+      "pabsw %%xmm4,%%xmm5 \n"
+      "pabsb %%xmm4,%%xmm4 \n"  // 0x0101
+      "psllw $0x3,%%xmm5 \n"  // 0x0008
       "lea 0x00(%4,%4,2),%3 \n"
 
       LABELALIGN
@@ -420,9 +416,9 @@ void ScaleRowDown4Box_AVX2(const uint8_t* src_ptr,
                           int dst_width) {
   asm volatile(
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"
-      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
-      "vpsllw $0x3,%%ymm4,%%ymm5 \n"
-      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpabsw %%ymm4,%%ymm5 \n"
+      "vpabsb %%ymm4,%%ymm4 \n"  // 0x0101
+      "vpsllw $0x3,%%ymm5,%%ymm5 \n"  // 0x0008
 
       LABELALIGN
       "1: \n"
@@ -2385,8 +2381,7 @@ void ScaleUVRowDown2Box_AVX2(const uint8_t* src_ptr,
                             int dst_width) {
   asm volatile(
       "vpcmpeqb %%ymm4,%%ymm4,%%ymm4 \n"  // 01010101
-      "vpsrlw $0xf,%%ymm4,%%ymm4 \n"
-      "vpackuswb %%ymm4,%%ymm4,%%ymm4 \n"
+      "vpabsb %%ymm4,%%ymm4 \n"
       "vpxor %%ymm5,%%ymm5,%%ymm5 \n"  // zero
       "vbroadcastf128 %4,%%ymm1 \n"  // split shuffler
       "vbroadcastf128 %5,%%ymm3 \n"  // merge shuffler