diff --git a/include/libyuv/rotate_row.h b/include/libyuv/rotate_row.h
index a45948094..d4a974c54 100644
--- a/include/libyuv/rotate_row.h
+++ b/include/libyuv/rotate_row.h
@@ -27,12 +27,10 @@ extern "C" {
 #define LIBYUV_DISABLE_NEON
 #endif

-// clang >= 19.0.0 required for SME
-#if !defined(LIBYUV_DISABLE_SME) && defined(__clang__) && defined(__aarch64__)
-#if __clang_major__ < 19
+// temporarily disable SME
+#if !defined(LIBYUV_DISABLE_SME)
 #define LIBYUV_DISABLE_SME
 #endif
-#endif

 // MemorySanitizer does not support assembly code yet. http://crbug.com/344505
 #if defined(__has_feature)
diff --git a/include/libyuv/row.h b/include/libyuv/row.h
index 5c110dd2b..97eabbf67 100644
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@@ -400,10 +400,11 @@ extern "C" {

 // The following are available for AVX512 clang x86 platforms:
 // TODO(fbarchard): Port to GCC and Visual C
-// TODO(fbarchard): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI. Issue libyuv:789
+// TODO(b/42280744): re-enable HAS_ARGBTORGB24ROW_AVX512VBMI.
 #if !defined(LIBYUV_DISABLE_X86) && \
     (defined(__x86_64__) || defined(__i386__)) && defined(CLANG_HAS_AVX512)
 #define HAS_ARGBTORGB24ROW_AVX512VBMI
+#define HAS_CONVERT16TO8ROW_AVX512BW
 #define HAS_MERGEUVROW_AVX512BW
 #endif

@@ -3337,6 +3338,10 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
                           int width);
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width);
 void Convert16To8Row_Any_SSSE3(const uint16_t* src_ptr,
                                uint8_t* dst_ptr,
                                int scale,
@@ -3345,6 +3350,10 @@ void Convert16To8Row_Any_AVX2(const uint16_t* src_ptr,
                               uint8_t* dst_ptr,
                               int scale,
                               int width);
+void Convert16To8Row_Any_AVX512BW(const uint16_t* src_ptr,
+                                  uint8_t* dst_ptr,
+                                  int scale,
+                                  int width);
 void Convert16To8Row_NEON(const uint16_t* src_y,
                           uint8_t* dst_y,
                           int scale,
diff --git a/source/convert.cc b/source/convert.cc
index 4ff63f6f9..7d44d8ae4 100644
--- a/source/convert.cc
+++ b/source/convert.cc
@@ -705,6 +705,14 @@ int I010ToNV12(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif

 #if defined(HAS_MERGEUVROW_SSE2)
   if (TestCpuFlag(kCpuHasSSE2)) {
diff --git a/source/planar_functions.cc b/source/planar_functions.cc
index 6191e4423..6e8801cda 100644
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@@ -149,6 +149,14 @@ void Convert16To8Plane(const uint16_t* src_y,
     }
   }
 #endif
+#if defined(HAS_CONVERT16TO8ROW_AVX512BW)
+  if (TestCpuFlag(kCpuHasAVX512BW)) {
+    Convert16To8Row = Convert16To8Row_Any_AVX512BW;
+    if (IS_ALIGNED(width, 64)) {
+      Convert16To8Row = Convert16To8Row_AVX512BW;
+    }
+  }
+#endif

   // Convert plane
   for (y = 0; y < height; ++y) {
diff --git a/source/row_any.cc b/source/row_any.cc
index 2118ad500..67dc8d2f6 100644
--- a/source/row_any.cc
+++ b/source/row_any.cc
@@ -1684,8 +1684,8 @@ ANY11T(AB64ToARGBRow_Any_NEON, AB64ToARGBRow_NEON, 8, 4, uint16_t, uint8_t, 7)
 // Any 1 to 1 with parameter and shorts.  BPP measures in shorts.
 #define ANY11C(NAMEANY, ANY_SIMD, SBPP, BPP, STYPE, DTYPE, MASK)             \
   void NAMEANY(const STYPE* src_ptr, DTYPE* dst_ptr, int scale, int width) { \
-    SIMD_ALIGNED(STYPE vin[32]);                                             \
-    SIMD_ALIGNED(DTYPE vout[32]);                                            \
+    SIMD_ALIGNED(STYPE vin[64]);                                             \
+    SIMD_ALIGNED(DTYPE vout[64]);                                            \
     memset(vin, 0, sizeof(vin)); /* for msan */                              \
     int r = width & MASK;                                                    \
     int n = width & ~MASK;                                                   \
@@ -1715,6 +1715,15 @@ ANY11C(Convert16To8Row_Any_AVX2,
        uint8_t,
        31)
 #endif
+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+ANY11C(Convert16To8Row_Any_AVX512BW,
+       Convert16To8Row_AVX512BW,
+       2,
+       1,
+       uint16_t,
+       uint8_t,
+       63)
+#endif
 #ifdef HAS_CONVERT16TO8ROW_NEON
 ANY11C(Convert16To8Row_Any_NEON,
        Convert16To8Row_NEON,
diff --git a/source/row_gcc.cc b/source/row_gcc.cc
index 69babb453..cb757c755 100644
--- a/source/row_gcc.cc
+++ b/source/row_gcc.cc
@@ -5202,8 +5202,7 @@ void MultiplyRow_16_AVX2(const uint16_t* src_y,
                          int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"

       // 32 pixels per loop.
@@ -5239,8 +5238,7 @@ void DivideRow_16_AVX2(const uint16_t* src_y,
                        int width) {
   asm volatile (
       "vmovd %3,%%xmm3 \n"
-      "vpunpcklwd %%xmm3,%%xmm3,%%xmm3 \n"
-      "vbroadcastss %%xmm3,%%ymm3 \n"
+      "vpbroadcastw %%xmm3,%%ymm3 \n"
       "sub %0,%1 \n"

       // 32 pixels per loop.
@@ -5306,8 +5304,7 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"

       // 32 pixels per loop.
       LABELALIGN
@@ -5332,6 +5329,38 @@ void Convert16To8Row_AVX2(const uint16_t* src_y,
 }
 #endif  // HAS_CONVERT16TO8ROW_AVX2

+#ifdef HAS_CONVERT16TO8ROW_AVX512BW
+void Convert16To8Row_AVX512BW(const uint16_t* src_y,
+                              uint8_t* dst_y,
+                              int scale,
+                              int width) {
+  asm volatile (
+      "vpbroadcastw %3,%%zmm2 \n"
+
+      // 64 pixels per loop.
+      LABELALIGN
+      "1: \n"
+      "vmovdqu8 (%0),%%zmm0 \n"
+      "vmovdqu8 0x40(%0),%%zmm1 \n"
+      "add $0x80,%0 \n"
+      "vpmulhuw %%zmm2,%%zmm0,%%zmm0 \n"
+      "vpmulhuw %%zmm2,%%zmm1,%%zmm1 \n"
+      "vpmovuswb %%zmm0,%%ymm0 \n"
+      "vpmovuswb %%zmm1,%%ymm1 \n"
+      "vmovdqu8 %%ymm0,(%1) \n"
+      "vmovdqu8 %%ymm1,0x20(%1) \n"
+      "add $0x40,%1 \n"
+      "sub $0x40,%2 \n"
+      "jg 1b \n"
+      "vzeroupper \n"
+      : "+r"(src_y),  // %0
+        "+r"(dst_y),  // %1
+        "+r"(width)   // %2
+      : "r"(scale)    // %3
+      : "memory", "cc", "xmm0", "xmm1", "xmm2");
+}
+#endif  // HAS_CONVERT16TO8ROW_AVX512BW
+
 // Use scale to convert to lsb formats depending how many bits there are:
 // 512 = 9 bits
 // 1024 = 10 bits
@@ -5374,8 +5403,7 @@ void Convert8To16Row_AVX2(const uint8_t* src_y,
                           int width) {
   asm volatile (
       "vmovd %3,%%xmm2 \n"
-      "vpunpcklwd %%xmm2,%%xmm2,%%xmm2 \n"
-      "vbroadcastss %%xmm2,%%ymm2 \n"
+      "vpbroadcastw %%xmm2,%%ymm2 \n"

       // 32 pixels per loop.
       LABELALIGN
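
For reference, the new AVX512BW kernel computes the same per-pixel operation as the existing C and AVX2 paths: multiply each 16-bit sample by scale, keep the high 16 bits (vpmulhuw), then saturate to 8 bits (vpmovuswb). Below is a minimal scalar sketch of that operation; the function name and the example scale values are illustrative and not taken from this patch.

// Scalar sketch (assumed equivalent): dst = min((src * scale) >> 16, 255).
// Example scales (assumptions): 16384 maps 10-bit input to 8 bits,
// 4096 maps 12-bit input to 8 bits.
#include <stdint.h>

static void Convert16To8Row_Sketch(const uint16_t* src_y,
                                   uint8_t* dst_y,
                                   int scale,
                                   int width) {
  for (int x = 0; x < width; ++x) {
    uint32_t v = ((uint32_t)src_y[x] * (uint32_t)scale) >> 16;
    dst_y[x] = (uint8_t)(v > 255 ? 255 : v);  // unsigned saturate to byte
  }
}

Widths that are not multiples of 64 are handled by the ANY11C wrapper above (MASK 63); the dispatch in convert.cc and planar_functions.cc only selects the full-width AVX512BW kernel when IS_ALIGNED(width, 64).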