diff --git a/source/row_win.cc b/source/row_win.cc index c5a14f86f..ba5e53185 100644 --- a/source/row_win.cc +++ b/source/row_win.cc @@ -3461,17 +3461,14 @@ __declspec(naked) void MergeUVRow_AVX2(const uint8_t* src_u, sub edx, eax convertloop: - vmovdqu ymm0, [eax] // read 32 U's - vmovdqu ymm1, [eax + edx] // and 32 V's - lea eax, [eax + 32] - vpunpcklbw ymm2, ymm0, ymm1 // low 16 UV pairs. mutated qqword 0,2 - vpunpckhbw ymm0, ymm0, ymm1 // high 16 UV pairs. mutated qqword 1,3 - vextractf128 [edi], ymm2, 0 // bytes 0..15 - vextractf128 [edi + 16], ymm0, 0 // bytes 16..31 - vextractf128 [edi + 32], ymm2, 1 // bytes 32..47 - vextractf128 [edi + 48], ymm0, 1 // bytes 47..63 - lea edi, [edi + 64] - sub ecx, 32 + vpmovzxbw ymm0, [eax] + vpmovzxbw ymm1, [eax + edx] + lea eax, [eax + 16] + vpsllw ymm1, ymm1, 8 + vpor ymm2, ymm1, ymm0 + vmovdqu [edi], ymm2 + lea edi, [edi + 32] + sub ecx, 16 jg convertloop pop edi