/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "row.h"

extern "C" {

#ifdef HAS_ARGBTOYROW_SSSE3
#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var

// Constant multiplication table for converting ARGB to I400.
extern "C" TALIGN16(const int8, kRGBToY[16]) = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

extern "C" TALIGN16(const int8, kRGBToU[16]) = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

extern "C" TALIGN16(const int8, kRGBToV[16]) = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

extern "C" TALIGN16(const uint8, kAddY16[16]) = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
};

extern "C" TALIGN16(const uint8, kAddUV128[16]) = {
  128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u,
  128u, 0u, 128u, 0u, 128u, 0u, 128u, 0u
};

__declspec(naked)
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  __asm {
    mov        eax, [esp + 4]   // src_argb
    mov        edx, [esp + 8]   // dst_y
    mov        ecx, [esp + 12]  // pix
    movdqa     xmm7, _kRGBToY
    movdqa     xmm6, _kAddY16
    pcmpeqb    xmm5, xmm5       // Generate mask 0x0000ffff
    psrld      xmm5, 16

 convertloop:
    movdqa     xmm0, [eax]
    movdqa     xmm1, [eax + 16]
    pmaddubsw  xmm0, xmm7
    lea        eax, [eax + 32]
    pmaddubsw  xmm1, xmm7       // BG ra BG ra BG ra BG ra
    palignr    xmm2, xmm0, 2    // AR xx AR xx AR xx AR xx
    paddw      xmm2, xmm0       // BGRA xx BGRA xx BGRA xx BGRA xx
    pand       xmm2, xmm5       // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    palignr    xmm3, xmm1, 2
    paddw      xmm3, xmm1
    pand       xmm3, xmm5       // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    packssdw   xmm2, xmm3       // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
    psrlw      xmm2, 7          // 0B xx 0B xx 0B xx 0B xx
    packuswb   xmm2, xmm2
    paddb      xmm2, xmm6
    movq       qword ptr [edx], xmm2
    lea        edx, [edx + 8]
    sub        ecx, 8
    ja         convertloop
    ret
  }
}

__declspec(naked)
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  __asm {
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // src_argb
    mov        esi, [esp + 8 + 8]   // src_stride_argb
    mov        edx, [esp + 8 + 12]  // dst_u
    mov        edi, [esp + 8 + 16]  // dst_v
    mov        ecx, [esp + 8 + 20]  // pix
    movdqa     xmm7, _kRGBToU
    movdqa     xmm6, _kRGBToV
    movdqa     xmm5, _kAddUV128
    pcmpeqb    xmm4, xmm4           // Generate mask 0x0000ffff
    psrld      xmm4, 16

 convertloop:
    // step 1 - subsample 8x2 argb pixels to 4x1
    movdqa     xmm0, [eax]          // 32x2 -> 32x1
    movdqa     xmm1, [eax + 16]
    movdqa     xmm2, [eax + esi]
    movdqa     xmm3, [eax + esi + 16]
    lea        eax, [eax + 32]
    pavgb      xmm0, xmm2
    pavgb      xmm1, xmm3
    movdqa     xmm2, xmm0           // 32x1 -> 16x1
    shufps     xmm0, xmm1, 0x88
    shufps     xmm2, xmm1, 0xdd
    pavgb      xmm0, xmm2

    // step 2 - convert to U and V
    // from here down is very similar to the Y code except
    // instead of 8 different pixels, it's 4 pixels of U and 4 of V
    movdqa     xmm1, xmm0
    pmaddubsw  xmm0, xmm7           // U
    pmaddubsw  xmm1, xmm6           // V
    palignr    xmm2, xmm0, 2        // AR xx AR xx AR xx AR xx
    paddw      xmm2, xmm0           // BGRA xx BGRA xx BGRA xx BGRA xx
    pand       xmm2, xmm4           // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    palignr    xmm3, xmm1, 2
    paddw      xmm3, xmm1
    pand       xmm3, xmm4           // BGRA 00 BGRA 00 BGRA 00 BGRA 00
    psraw      xmm2, 8
    psraw      xmm3, 8
    packsswb   xmm2, xmm3           // BGRA BGRA BGRA BGRA BGRA BGRA BGRA BGRA
    paddb      xmm2, xmm5           // -> unsigned
    packuswb   xmm2, xmm2           // 8 bytes.  4 U, 4 V

    // step 3 - store 4 U and 4 V values
    movd       dword ptr [edx], xmm2  // U
    lea        edx, [edx + 4]
    pshufd     xmm0, xmm2, 0x55       // V
    movd       dword ptr [edi], xmm0
    lea        edi, [edi + 4]
    sub        ecx, 8
    ja         convertloop

    pop        edi
    pop        esi
    ret
  }
}
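
// Note on the coefficient tables above: pmaddubsw multiplies unsigned source
// bytes by signed coefficient bytes, so the Y coefficients (25, 129, 66 for
// B, G, R in the scalar reference below) are stored halved as 13, 65, 33 to
// fit in int8; ARGBToYRow_SSSE3 compensates by shifting right by 7 instead of
// 8, then adds the +16 bias via kAddY16. The U and V coefficients already fit
// in int8, so ARGBToUVRow_SSSE3 shifts by 8 and applies the +128 bias via
// kAddUV128.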
static inline int RGBToY(uint8 r, uint8 g, uint8 b) {
  return (( 66 * r + 129 * g +  25 * b + 128) >> 8) + 16;
}

static inline int RGBToU(uint8 r, uint8 g, uint8 b) {
  return ((-38 * r -  74 * g + 112 * b + 128) >> 8) + 128;
}

static inline int RGBToV(uint8 r, uint8 g, uint8 b) {
  return ((112 * r -  94 * g -  18 * b + 128) >> 8) + 128;
}

void ARGBToYRow_C(const uint8* src_argb0, uint8* dst_y, int width) {
  for (int x = 0; x < width; ++x) {
    dst_y[0] = RGBToY(src_argb0[2], src_argb0[1], src_argb0[0]);
    src_argb0 += 4;
    dst_y += 1;
  }
}

void ARGBToUVRow_C(const uint8* src_argb0, int src_stride_argb,
                   uint8* dst_u, uint8* dst_v, int width) {
  const uint8* src_argb1 = src_argb0 + src_stride_argb;
  for (int x = 0; x < width - 1; x += 2) {
    uint8 ab = (src_argb0[0] + src_argb0[4] + src_argb1[0] + src_argb1[4]) >> 2;
    uint8 ag = (src_argb0[1] + src_argb0[5] + src_argb1[1] + src_argb1[5]) >> 2;
    uint8 ar = (src_argb0[2] + src_argb0[6] + src_argb1[2] + src_argb1[6]) >> 2;
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
    src_argb0 += 8;
    src_argb1 += 8;
    dst_u += 1;
    dst_v += 1;
  }
  if (width & 1) {
    uint8 ab = (src_argb0[0] + src_argb1[0]) >> 1;
    uint8 ag = (src_argb0[1] + src_argb1[1]) >> 1;
    uint8 ar = (src_argb0[2] + src_argb1[2]) >> 1;
    dst_u[0] = RGBToU(ar, ag, ab);
    dst_v[0] = RGBToV(ar, ag, ab);
  }
}
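
// Illustrative sketch only (not part of the library): one way the reference
// row functions above could be composed into a frame-level ARGB -> I420
// conversion. The helper name and signature are hypothetical, and height is
// assumed even for brevity.
static void ARGBToI420Frame_C(const uint8* src_argb, int src_stride_argb,
                              uint8* dst_y, int dst_stride_y,
                              uint8* dst_u, int dst_stride_u,
                              uint8* dst_v, int dst_stride_v,
                              int width, int height) {
  for (int y = 0; y < height; y += 2) {
    // Two full-resolution Y rows per pass.
    ARGBToYRow_C(src_argb, dst_y, width);
    ARGBToYRow_C(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
    // One half-height row of U and V per pair of ARGB rows; ARGBToUVRow_C
    // averages 2x2 blocks internally using src_stride_argb.
    ARGBToUVRow_C(src_argb, src_stride_argb, dst_u, dst_v, width);
    src_argb += src_stride_argb * 2;
    dst_y += dst_stride_y * 2;
    dst_u += dst_stride_u;
    dst_v += dst_stride_v;
  }
}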
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop:
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsRgbY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop:
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsBgraY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]
    mov       edi, [esp + 32 + 8]
    mov       esi, [esp + 32 + 12]
    mov       ebp, [esp + 32 + 16]
    mov       ecx, [esp + 32 + 20]

 convertloop:
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
    movzx     ebx, byte ptr [edx + 1]
    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width

 convertloop:
    movzx     eax, byte ptr [edi]
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
    movzx     eax, byte ptr [edx]
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    lea       edx, [edx + 1]
    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]
    psraw     mm0, 6
    packuswb  mm0, mm0
    movd      [ebp], mm0
    lea       ebp, [ebp + 4]
    sub       ecx, 1
    ja        convertloop

    popad
    ret
  }
}

__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push      ebx
    mov       eax, [esp + 4 + 4]   // Y
    mov       edx, [esp + 4 + 8]   // rgb
    mov       ecx, [esp + 4 + 12]  // width

 convertloop:
    movzx     ebx, byte ptr [eax]
    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm0, 6
    movzx     ebx, byte ptr [eax + 1]
    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm1, 6
    packuswb  mm0, mm1
    lea       eax, [eax + 2]
    movq      [edx], mm0
    lea       edx, [edx + 8]
    sub       ecx, 2
    ja        convertloop

    pop       ebx
    ret
  }
}

#endif

}  // extern "C"
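
#ifdef HAS_ARGBTOYROW_SSSE3
// Illustrative sketch only (not part of the library): driving the MMX row
// converter above across a full I420 frame. The helper name and signature are
// hypothetical; width is assumed even (the row converter handles 2 pixels per
// iteration) and height even for brevity.
static void I420ToARGBFrame_MMX(const uint8* src_y, int src_stride_y,
                                const uint8* src_u, int src_stride_u,
                                const uint8* src_v, int src_stride_v,
                                uint8* dst_argb, int dst_stride_argb,
                                int width, int height) {
  for (int y = 0; y < height; ++y) {
    // Each U/V row is shared by two consecutive Y rows (4:2:0 subsampling).
    FastConvertYUVToRGB32Row(src_y, src_u, src_v, dst_argb, width);
    src_y += src_stride_y;
    dst_argb += dst_stride_argb;
    if (y & 1) {
      src_u += src_stride_u;
      src_v += src_stride_v;
    }
  }
  __asm { emms }  // Clear MMX state before any subsequent x87 use.
}
#endif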