Change Sobel to use the JPEG luma calculation instead of extracting the G channel. Using luma produces a better Sobel result that respects all 3 channels of RGB. Historically the G channel was used to improve performance, and because the luma of I420 has a constrained range, which hurts quality. With the JPEG variation of YUV, the luma is more accurate (including across platforms), is full range, and is better optimized for AVX2 and odd widths.

BUG=444 TESTED=ARGBSobelXY_Opt R=harryjin@google.com Review URL: https://webrtc-codereview.appspot.com/57479004 git-svn-id: http://libyuv.googlecode.com/svn/trunk@1414 16f28f9a-4ce2-e073-06de-1de4eb20be90
2026-02-13 21:59:52 +08:00 · 2015-05-27 22:32:26 +00:00 · 2015-05-27 22:32:26 +00:00 · cfce47efc8
commit cfce47efc8
parent 535a7140f2
11 changed files with 47 additions and 191 deletions
--- a/README.chromium
+++ b/README.chromium
@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1415
+Version: 1416
 License: BSD
 License File: LICENSE
--- a/include/libyuv/convert_from_argb.h
+++ b/include/libyuv/convert_from_argb.h
@ -143,6 +143,12 @@ int ARGBToI400(const uint8* src_argb, int src_stride_argb,
               uint8* dst_y, int dst_stride_y,
               int width, int height);
 // Convert ARGB to G. (Reverse of J400toARGB, which replicates G back to ARGB)
 LIBYUV_API
 int ARGBToG(const uint8* src_argb, int src_stride_argb,
            uint8* dst_g, int dst_stride_g,
            int width, int height);
 // Convert ARGB To NV12.
 LIBYUV_API
 int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
--- a/include/libyuv/row.h
+++ b/include/libyuv/row.h
@ -70,7 +70,6 @@ extern "C" {
 #define HAS_ARGBSHUFFLEROW_SSSE3
 #define HAS_ARGBTOARGB1555ROW_SSE2
 #define HAS_ARGBTOARGB4444ROW_SSE2
 #define HAS_ARGBTOBAYERGGROW_SSE2
 #define HAS_ARGBTORAWROW_SSSE3
 #define HAS_ARGBTORGB24ROW_SSSE3
 #define HAS_ARGBTORGB565ROW_SSE2
@ -271,7 +270,6 @@ extern "C" {
 #define HAS_ARGB4444TOYROW_NEON
 #define HAS_ARGBTOARGB1555ROW_NEON
 #define HAS_ARGBTOARGB4444ROW_NEON
 #define HAS_ARGBTOBAYERGGROW_NEON
 #define HAS_ARGBTORAWROW_NEON
 #define HAS_ARGBTORGB24ROW_NEON
 #define HAS_ARGBTORGB565ROW_NEON
@ -1632,17 +1630,6 @@ void UYVYToUVRow_Any_NEON(const uint8* src_uyvy, int stride_uyvy,
 void UYVYToUV422Row_Any_NEON(const uint8* src_uyvy,
                             uint8* dst_u, uint8* dst_v, int pix);
 void ARGBToBayerGGRow_C(const uint8* src_argb, uint8* dst_bayer,
                        uint32 /* selector */, int pix);
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /* selector */, int pix);
 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /* selector */, int pix);
 void ARGBToBayerGGRow_Any_SSE2(const uint8* src_argb, uint8* dst_bayer,
                               uint32 /* selector */, int pix);
 void ARGBToBayerGGRow_Any_NEON(const uint8* src_argb, uint8* dst_bayer,
                               uint32 /* selector */, int pix);
 void I422ToYUY2Row_C(const uint8* src_y,
                     const uint8* src_u,
                     const uint8* src_v,
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_  // NOLINT
 #define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1415
+#define LIBYUV_VERSION 1416
 #endif  // INCLUDE_LIBYUV_VERSION_H_  NOLINT
--- a/source/planar_functions.cc
+++ b/source/planar_functions.cc
@ -1976,8 +1976,8 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
                                         const uint8* src_sobely,
                                         uint8* dst, int width)) {
  int y;
-  void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+  void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_g, int pix) =
-                         uint32 selector, int pix) = ARGBToBayerGGRow_C;
+      ARGBToYJRow_C;
  void (*SobelYRow)(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) = SobelYRow_C;
  void (*SobelXRow)(const uint8* src_y0, const uint8* src_y1,
@ -1993,31 +1993,32 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
    src_argb  = src_argb  + (height - 1) * src_stride_argb;
    src_stride_argb = -src_stride_argb;
  }
-  // ARGBToBayer used to select G channel from ARGB.
+
-#if defined(HAS_ARGBTOBAYERGGROW_SSE2)
+#if defined(HAS_ARGBTOYJROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSE2)) {
    ARGBToBayerRow = ARGBToBayerGGRow_Any_SSE2;
    if (IS_ALIGNED(width, 8)) {
      ARGBToBayerRow = ARGBToBayerGGRow_SSE2;
    }
  }
 #endif
 #if defined(HAS_ARGBTOBAYERROW_SSSE3)
  if (TestCpuFlag(kCpuHasSSSE3)) {
-    ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+    ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
-    if (IS_ALIGNED(width, 8)) {
+    if (IS_ALIGNED(width, 16)) {
-      ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+      ARGBToYJRow = ARGBToYJRow_SSSE3;
    }
  }
 #endif
-#if defined(HAS_ARGBTOBAYERGGROW_NEON)
+#if defined(HAS_ARGBTOYJROW_AVX2)
  if (TestCpuFlag(kCpuHasAVX2)) {
    ARGBToYJRow = ARGBToYJRow_Any_AVX2;
    if (IS_ALIGNED(width, 32)) {
      ARGBToYJRow = ARGBToYJRow_AVX2;
    }
  }
 #endif
 #if defined(HAS_ARGBTOYJROW_NEON)
  if (TestCpuFlag(kCpuHasNEON)) {
-    ARGBToBayerRow = ARGBToBayerGGRow_Any_NEON;
+    ARGBToYJRow = ARGBToYJRow_Any_NEON;
    if (IS_ALIGNED(width, 8)) {
-      ARGBToBayerRow = ARGBToBayerGGRow_NEON;
+      ARGBToYJRow = ARGBToYJRow_NEON;
    }
  }
 #endif
 #if defined(HAS_SOBELYROW_SSE2)
  if (TestCpuFlag(kCpuHasSSE2)) {
    SobelYRow = SobelYRow_SSE2;
@ -2050,20 +2051,20 @@ static int ARGBSobelize(const uint8* src_argb, int src_stride_argb,
    uint8* row_y0 = row_y + kEdge;
    uint8* row_y1 = row_y0 + kRowSize;
    uint8* row_y2 = row_y1 + kRowSize;
-    ARGBToBayerRow(src_argb, row_y0, 0x0d090501, width);
+    ARGBToYJRow(src_argb, row_y0, width);
    row_y0[-1] = row_y0[0];
    memset(row_y0 + width, row_y0[width - 1], 16);  // Extrude 16 for valgrind.
-    ARGBToBayerRow(src_argb, row_y1, 0x0d090501, width);
+    ARGBToYJRow(src_argb, row_y1, width);
    row_y1[-1] = row_y1[0];
    memset(row_y1 + width, row_y1[width - 1], 16);
    memset(row_y2 + width, 0, 16);
    for (y = 0; y < height; ++y) {
-      // Convert next row of ARGB to Y.
+      // Convert next row of ARGB to G.
      if (y < (height - 1)) {
        src_argb += src_stride_argb;
      }
-      ARGBToBayerRow(src_argb, row_y2, 0x0d090501, width);
+      ARGBToYJRow(src_argb, row_y2, width);
      row_y2[-1] = row_y2[0];
      row_y2[width] = row_y2[width - 1];
--- a/source/row_any.cc
+++ b/source/row_any.cc
@ -278,27 +278,6 @@ RGBDANY(ARGBToRGB565DitherRow_Any_NEON, ARGBToRGB565DitherRow_NEON,
 #endif
 #undef RGBDANY
 // BAYERANY generates an "Any width" wrapper named NAMEANY around a SIMD row
 // function that takes (src, dst, selector, width).
 //   - The SIMD function handles the leading n = width & ~MASK pixels and is
 //     skipped entirely when n is 0.
 //   - The C function is always called for the remaining (width & MASK)
 //     pixels, starting n * SBPP bytes into src and n * BPP bytes into dst.
 // Both instantiations below pass SBPP = 4, BPP = 1 and MASK = 7, i.e. the
 // SIMD path is given a multiple of 8 pixels.
 // selector is forwarded unchanged to both functions.
 #define BAYERANY(NAMEANY, ARGBTORGB_SIMD, ARGBTORGB_C, SBPP, BPP, MASK)        \
    void NAMEANY(const uint8* src, uint8* dst, uint32 selector, int width) {   \
      int n = width & ~MASK;                                                   \
      if (n > 0) {                                                             \
        ARGBTORGB_SIMD(src, dst, selector, n);                                 \
      }                                                                        \
      ARGBTORGB_C(src + n * SBPP, dst + n * BPP, selector, width & MASK);      \
    }
 // SSE2 wrapper: ARGBToBayerGGRow_SSE2 for the bulk, ARGBToBayerGGRow_C for
 // the tail.
 #if defined(HAS_ARGBTOBAYERGGROW_SSE2)
 BAYERANY(ARGBToBayerGGRow_Any_SSE2, ARGBToBayerGGRow_SSE2, ARGBToBayerGGRow_C,
         4, 1, 7)
 #endif
 // NEON wrapper: ARGBToBayerGGRow_NEON for the bulk, ARGBToBayerGGRow_C for
 // the tail.
 #if defined(HAS_ARGBTOBAYERGGROW_NEON)
 BAYERANY(ARGBToBayerGGRow_Any_NEON, ARGBToBayerGGRow_NEON, ARGBToBayerGGRow_C,
         4, 1, 7)
 #endif
 #undef BAYERANY
 #define YANY(NAMEANY, ARGBTOY_SIMD, ARGBTOY_C, SBPP, BPP, MASK)                \
    void NAMEANY(const uint8* src_argb, uint8* dst_y, int width) {             \
      int n = width & ~MASK;                                                   \
--- a/source/row_common.cc
+++ b/source/row_common.cc
@ -1011,17 +1011,17 @@ void J400ToARGBRow_C(const uint8* src_y, uint8* dst_argb, int width) {
 #define VR -102 /* round(-1.596 * 64) */
 // Bias values to subtract 16 from Y and 128 from U and V.
-#define BB (UB * 128            + YGB)
+#define BB (UB * 128 + YGB)
 #define BG (UG * 128 + VG * 128 + YGB)
-#define BR            (VR * 128 + YGB)
+#define BR (VR * 128 + YGB)
 // C reference code that mimics the YUV assembly.
 static __inline void YuvPixel(uint8 y, uint8 u, uint8 v,
                              uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YG) >> 16;
-  *b = Clamp((int32)(-(         u * UB) + y1 + BB) >> 6);
+  *b = Clamp((int32)(-(u * UB) + y1 + BB) >> 6);
  *g = Clamp((int32)(-(v * VG + u * UG) + y1 + BG) >> 6);
-  *r = Clamp((int32)(-(v * VR         ) + y1 + BR) >> 6);
+  *r = Clamp((int32)(-(v * VR)+ y1 + BR) >> 6);
 }
 // C reference code that mimics the YUV assembly.
@ -1059,17 +1059,17 @@ static __inline void YPixel(uint8 y, uint8* b, uint8* g, uint8* r) {
 #define VRJ -90 /* round(-1.40200 * 64) */
 // Bias values to subtract 16 from Y and 128 from U and V.
-#define BBJ (UBJ * 128             + YGBJ)
+#define BBJ (UBJ * 128 + YGBJ)
 #define BGJ (UGJ * 128 + VGJ * 128 + YGBJ)
-#define BRJ             (VRJ * 128 + YGBJ)
+#define BRJ (VRJ * 128 + YGBJ)
 // C reference code that mimics the YUV assembly.
 static __inline void YuvJPixel(uint8 y, uint8 u, uint8 v,
                               uint8* b, uint8* g, uint8* r) {
  uint32 y1 = (uint32)(y * 0x0101 * YGJ) >> 16;
-  *b = Clamp((int32)(-(          u * UBJ) + y1 + BBJ) >> 6);
+  *b = Clamp((int32)(-(u * UBJ) + y1 + BBJ) >> 6);
  *g = Clamp((int32)(-(v * VGJ + u * UGJ) + y1 + BGJ) >> 6);
-  *r = Clamp((int32)(-(v * VRJ          ) + y1 + BRJ) >> 6);
+  *r = Clamp((int32)(-(v * VRJ) + y1 + BRJ) >> 6);
 }
 #undef YGJ
@ -2086,22 +2086,6 @@ void InterpolateRow_16_C(uint16* dst_ptr, const uint16* src_ptr,
  }
 }
 // Select G channel from ARGB.  e.g.  GGGGGGGG
 // Writes one byte per source pixel: byte 1 of each 4-byte pixel in src_argb
 // is copied to dst_bayer, for pix pixels.
 // selector is accepted but never read by this function.
 void ARGBToBayerGGRow_C(const uint8* src_argb,
                        uint8* dst_bayer, uint32 selector, int pix) {
  // Copy a row of G.
  int x;
  // Main loop handles two pixels (8 source bytes) per iteration.
  for (x = 0; x < pix - 1; x += 2) {
    dst_bayer[0] = src_argb[1];
    dst_bayer[1] = src_argb[5];
    src_argb += 8;
    dst_bayer += 2;
  }
  // An odd pixel count leaves one trailing pixel to copy.
  if (pix & 1) {
    dst_bayer[0] = src_argb[1];
  }
 }
 // Use first 4 shuffler values to reorder ARGB channels.
 void ARGBShuffleRow_C(const uint8* src_argb, uint8* dst_argb,
                      const uint8* shuffler, int pix) {
--- a/source/row_neon.cc
+++ b/source/row_neon.cc
@ -1251,25 +1251,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
  );
 }
 // Select G channels from ARGB.  e.g.  GGGGGGGG
 // Each loop iteration consumes 8 pixels (32 bytes) from src_argb and stores
 // 8 bytes to dst_bayer. The selector argument is unnamed and unused.
 // The loop body runs before the count is tested and steps by 8, so pix is
 // presumably a positive multiple of 8 - NOTE(review): confirm against
 // callers (ARGBToBayerGGRow_Any_NEON passes width & ~7).
 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /*selector*/, int pix) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    // De-interleaving load: byte k of every pixel lands in register d<k>,
    // so d1 receives the second byte (G) of each of the 8 pixels.
    "vld4.8     {d0, d1, d2, d3}, [%0]!        \n"  // load row 8 pixels.
    "subs       %2, %2, #8                     \n"  // 8 processed per loop
    MEMACCESS(1)
    "vst1.8     {d1}, [%1]!                    \n"  // store 8 G's.
    // Repeat while the remaining pixel count is still greater than zero.
    "bgt        1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "q0", "q1"  // Clobber List
  );
 }
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
--- a/source/row_neon64.cc
+++ b/source/row_neon64.cc
@ -1259,27 +1259,6 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
 }
 #endif  // HAS_UYVYTOUVROW_NEON
 // Select G channels from ARGB.  e.g.  GGGGGGGG
 // AArch64 variant, compiled only when HAS_ARGBTOBAYERGGROW_NEON is defined.
 // Each loop iteration consumes 8 pixels (32 bytes) from src_argb and stores
 // 8 bytes to dst_bayer. The selector argument is unnamed and unused.
 // The loop body runs before the count is tested and steps by 8, so pix is
 // presumably a positive multiple of 8 - NOTE(review): confirm against
 // callers.
 #ifdef HAS_ARGBTOBAYERGGROW_NEON
 void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
                           uint32 /*selector*/, int pix) {
  asm volatile (
  "1:                                          \n"
    MEMACCESS(0)
    // De-interleaving load: byte k of every pixel lands in register v<k>,
    // so v1 receives the second byte (G) of each of the 8 pixels.
    "ld4        {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"  // load row 8 pixels
    "subs       %w2, %w2, #8                   \n"  // 8 processed per loop
    MEMACCESS(1)
    "st1        {v1.8b}, [%1], #8              \n"  // store 8 G's.
    // Repeat while the remaining pixel count is still greater than zero.
    "b.gt       1b                             \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_bayer),  // %1
    "+r"(pix)         // %2
  :
  : "cc", "memory", "v0", "v1", "v2", "v3"  // Clobber List
  );
 }
 #endif  // HAS_ARGBTOBAYERGGROW_NEON
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 #ifdef HAS_ARGBSHUFFLEROW_NEON
 void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
--- a/source/row_posix.cc
+++ b/source/row_posix.cc
@ -1599,8 +1599,8 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
-    "movdqu     %%xmm1," MEMACCESS2(0x10,[dst_argb]) "           \n"           \
+    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
-    "lea        " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]          \n"
+    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
 // Store 8 BGRA values. Assumes XMM5 is zero.
 #define STOREBGRA                                                              \
@ -1611,8 +1611,8 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "            \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_bgra]) "           \n"           \
-    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra]           \n"
+    "lea       " MEMLEA(0x20, [dst_bgra]) ", %[dst_bgra]         \n"
 // Store 8 ABGR values. Assumes XMM5 is zero.
 #define STOREABGR                                                              \
@ -1622,8 +1622,8 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
    "punpcklwd %%xmm0,%%xmm2                                     \n"           \
    "punpckhwd %%xmm0,%%xmm1                                     \n"           \
    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "                  \n"           \
-    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "            \n"           \
+    "movdqu    %%xmm1," MEMACCESS2(0x10, [dst_abgr]) "           \n"           \
-    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr]           \n"
+    "lea       " MEMLEA(0x20, [dst_abgr]) ", %[dst_abgr]         \n"
 // Store 8 RGBA values. Assumes XMM5 is zero.
 #define STORERGBA                                                              \
@ -1634,8 +1634,8 @@ YuvConstants SIMD_ALIGNED(kYuvJConstants) = {
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
-    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "            \n"           \
+    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
-    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba]           \n"
+    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
 void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
@ -5030,37 +5030,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 #ifdef HAS_ARGBTOBAYERGGROW_SSE2
 // Extracts byte 1 (G) of each 4-byte ARGB pixel in src_argb into dst_bayer.
 // Each loop iteration reads 32 bytes (8 pixels) and writes 8 bytes.
 // selector is not referenced by the asm operands, so it is ignored.
 // The loop body runs before the count is tested and steps by 8, so pix is
 // presumably a positive multiple of 8 - NOTE(review): confirm against
 // callers.
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    // Build a per-dword mask of 0x000000ff in xmm5: all ones, then shift
    // each 32-bit lane right by 24.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    // Load 8 pixels (2 x 16 bytes) and advance the source pointer.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Shift byte 1 of each pixel down to byte 0, then mask off the rest.
    "psrld     $0x8,%%xmm0                     \n"
    "psrld     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    // Narrow dwords to words (all 8 values end up in xmm0), then words to
    // bytes; only the low 8 bytes of xmm0 are stored below.
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    // 8 pixels done; loop while the remaining count is greater than zero.
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
 }
 #endif  // HAS_ARGBTOBAYERGGROW_SSE2
 #ifdef HAS_ARGBSHUFFLEROW_SSSE3
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
--- a/source/row_win.cc
+++ b/source/row_win.cc
@ -5875,36 +5875,6 @@ void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
 }
 #endif  // HAS_INTERPOLATEROW_SSE2
 // Specialized ARGB to Bayer that just isolates G channel.
 // Each loop iteration reads 32 bytes (8 pixels) from src_argb and writes
 // 8 bytes to dst_bayer. The selector argument is never loaded.
 // Declared naked: there is no compiler-generated prologue/epilogue, so the
 // arguments are read directly from the stack relative to esp and the
 // function returns with an explicit ret.
 // The loop body runs before the count is tested and steps by 8, so pix is
 // presumably a positive multiple of 8 - NOTE(review): confirm against
 // callers.
 __declspec(naked)
 void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  __asm {
    mov        eax, [esp + 4]    // src_argb
    mov        edx, [esp + 8]    // dst_bayer
                                 // selector
    mov        ecx, [esp + 16]   // pix
    pcmpeqb    xmm5, xmm5        // generate mask 0x000000ff
    psrld      xmm5, 24
  wloop:
    // Load 8 pixels (2 x 16 bytes) and advance the source pointer.
    movdqu     xmm0, [eax]
    movdqu     xmm1, [eax + 16]
    lea        eax, [eax + 32]
    psrld      xmm0, 8  // Move green to bottom.
    psrld      xmm1, 8
    // Keep only the low byte of each 32-bit lane.
    pand       xmm0, xmm5
    pand       xmm1, xmm5
    // Narrow dwords to words (all 8 values end up in xmm0), then words to
    // bytes; only the low 8 bytes of xmm0 are stored below.
    packssdw   xmm0, xmm1
    packuswb   xmm0, xmm1
    movq       qword ptr [edx], xmm0
    lea        edx, [edx + 8]
    // 8 pixels done; loop while the remaining count is greater than zero.
    sub        ecx, 8
    jg         wloop
    ret
  }
 }
 // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
 __declspec(naked)
 void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,