tizen 2.3.1 release
[external/libjpeg-turbo.git] / simd / jdmrgss2-64.asm
index a64a6b3..ffbf6b2 100644 (file)
@@ -1,8 +1,8 @@
 ;
 ; jdmrgss2-64.asm - merged upsampling/color conversion (64-bit SSE2)
 ;
-; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
-; Copyright 2009 D. R. Commander
+; Copyright 2009, 2012 Pierre Ossman <ossman@cendio.se> for Cendio AB
+; Copyright 2009, 2012 D. R. Commander
 ;
 ; Based on
 ; x86 SIMD extension for IJG JPEG library
@@ -12,7 +12,7 @@
 ; This file should be assembled with NASM (Netwide Assembler),
 ; can *not* be assembled with Microsoft's MASM or any compatible
 ; assembler (including Borland's Turbo Assembler).
-; NASM is available from http://nasm.sourceforge.net/ for
+; NASM is available from http://nasm.sourceforge.net/ or
 ; http://sourceforge.net/project/showfiles.php?group_id=6208
 ;
 ; [TAB8]
@@ -252,17 +252,13 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmH,xmmH                    ; xmmH=(all 1's)
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmF,xmmH                    ; movntdqu XMMWORD [rdi], xmmF
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmF
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -275,31 +271,28 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmH,xmmH                       ; xmmH=(all 1's)
        lea     rcx, [rcx+rcx*2]                ; imul ecx, RGB_PIXELSIZE
        cmp     rcx, byte 2*SIZEOF_XMMWORD
        jb      short .column_st16
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmH                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmF
        sub     rcx, byte 2*SIZEOF_XMMWORD
        jmp     short .column_st15
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD
        jb      short .column_st15
-       maskmovdqu xmmA,xmmH                    ; movntdqu XMMWORD [rdi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store the lower 8 bytes of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_MMWORD
        jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_MMWORD
        sub     rcx, byte SIZEOF_MMWORD
        psrldq  xmmA, SIZEOF_MMWORD
@@ -308,7 +301,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        ; space.
        cmp     rcx, byte SIZEOF_DWORD
        jb      short .column_st3
-       movd    DWORD [rdi], xmmA
+       movd    XMM_DWORD [rdi], xmmA
        add     rdi, byte SIZEOF_DWORD
        sub     rcx, byte SIZEOF_DWORD
        psrldq  xmmA, SIZEOF_DWORD
@@ -328,47 +321,6 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        test    rcx, rcx
        jz      short .endcolumn
        mov     BYTE [rdi], al
-%else
-       mov     rax,rcx
-       xor     rcx, byte 0x0F
-       shl     rcx, 2
-       movd    xmmB,ecx
-       psrlq   xmmH,4
-       pcmpeqb xmmE,xmmE
-       psrlq   xmmH,xmmB
-       psrlq   xmmE,xmmB
-       punpcklbw xmmE,xmmH
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       add     rax,rcx
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmG,xmmA
-       movdqa  xmmC,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmD,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmF,ecx
-       psllq   xmmA,xmmF
-       psllq   xmmE,xmmF
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmF,ecx
-       psrlq   xmmA,xmmF
-       psrlq   xmmE,xmmF
-       psllq   xmmG,xmmD
-       psllq   xmmC,xmmD
-       por     xmmA,xmmG
-       por     xmmE,xmmC
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
 
 %else ; RGB_PIXELSIZE == 4 ; -----------
 
@@ -413,19 +365,14 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        movntdq XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
        movntdq XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
        movntdq XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
-       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        jmp     short .out0
 .out1: ; --(unaligned)-----------------
-       pcmpeqb    xmmE,xmmE                    ; xmmE=(all 1's)
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmC,xmmE                    ; movntdqu XMMWORD [rdi], xmmC
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmH,xmmE                    ; movntdqu XMMWORD [rdi], xmmH
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       movdqu  XMMWORD [rdi+2*SIZEOF_XMMWORD], xmmC
+       movdqu  XMMWORD [rdi+3*SIZEOF_XMMWORD], xmmH
 .out0:
+       add     rdi, byte RGB_PIXELSIZE*SIZEOF_XMMWORD  ; outptr
        sub     rcx, byte SIZEOF_XMMWORD
        jz      near .endcolumn
 
@@ -438,30 +385,27 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        jmp     near .columnloop
 
 .column_st32:
-       pcmpeqb xmmE,xmmE                       ; xmmE=(all 1's)
        cmp     rcx, byte SIZEOF_XMMWORD/2
        jb      short .column_st16
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [rdi], xmmA
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
-       maskmovdqu xmmD,xmmE                    ; movntdqu XMMWORD [rdi], xmmD
-       add     rdi, byte SIZEOF_XMMWORD        ; outptr
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
+       movdqu  XMMWORD [rdi+1*SIZEOF_XMMWORD], xmmD
+       add     rdi, byte 2*SIZEOF_XMMWORD      ; outptr
        movdqa  xmmA,xmmC
        movdqa  xmmD,xmmH
        sub     rcx, byte SIZEOF_XMMWORD/2
 .column_st16:
        cmp     rcx, byte SIZEOF_XMMWORD/4
        jb      short .column_st15
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
+       movdqu  XMMWORD [rdi+0*SIZEOF_XMMWORD], xmmA
        add     rdi, byte SIZEOF_XMMWORD        ; outptr
        movdqa  xmmA,xmmD
        sub     rcx, byte SIZEOF_XMMWORD/4
 .column_st15:
-%ifdef STRICT_MEMORY_ACCESS
        ; Store two pixels (8 bytes) of xmmA to the output when it has enough
        ; space.
        cmp     rcx, byte SIZEOF_XMMWORD/8
        jb      short .column_st7
-       movq    MMWORD [rdi], xmmA
+       movq    XMM_MMWORD [rdi], xmmA
        add     rdi, byte SIZEOF_XMMWORD/8*4
        sub     rcx, byte SIZEOF_XMMWORD/8
        psrldq  xmmA, SIZEOF_XMMWORD/8*4
@@ -470,48 +414,7 @@ EXTN(jsimd_h2v1_merged_upsample_sse2):
        ; space.
        test    rcx, rcx
        jz      short .endcolumn
-       movd    DWORD [rdi], xmmA
-%else
-       cmp     rcx, byte SIZEOF_XMMWORD/16
-       jb      near .endcolumn
-       mov     rax,rcx
-       xor     rcx, byte 0x03
-       inc     rcx
-       shl     rcx, 4
-       movd    xmmF,ecx
-       psrlq   xmmE,xmmF
-       punpcklbw xmmE,xmmE
-       ; ----------------
-       mov     rcx,rdi
-       and     rcx, byte SIZEOF_XMMWORD-1
-       jz      short .adj0
-       lea     rax, [rcx+rax*4]        ; RGB_PIXELSIZE
-       cmp     rax, byte SIZEOF_XMMWORD
-       ja      short .adj0
-       and     rdi, byte (-SIZEOF_XMMWORD)     ; align to 16-byte boundary
-       shl     rcx, 3                  ; pslldq xmmA,ecx & pslldq xmmE,ecx
-       movdqa  xmmB,xmmA
-       movdqa  xmmG,xmmE
-       pslldq  xmmA, SIZEOF_XMMWORD/2
-       pslldq  xmmE, SIZEOF_XMMWORD/2
-       movd    xmmC,ecx
-       sub     rcx, byte (SIZEOF_XMMWORD/2)*BYTE_BIT
-       jb      short .adj1
-       movd    xmmH,ecx
-       psllq   xmmA,xmmH
-       psllq   xmmE,xmmH
-       jmp     short .adj0
-.adj1: neg     rcx
-       movd    xmmH,ecx
-       psrlq   xmmA,xmmH
-       psrlq   xmmE,xmmH
-       psllq   xmmB,xmmC
-       psllq   xmmG,xmmC
-       por     xmmA,xmmB
-       por     xmmE,xmmG
-.adj0: ; ----------------
-       maskmovdqu xmmA,xmmE                    ; movntdqu XMMWORD [edi], xmmA
-%endif ; STRICT_MEMORY_ACCESS ; ---------------
+       movd    XMM_DWORD [rdi], xmmA
 
 %endif ; RGB_PIXELSIZE ; ---------------