simd/jcqnts2f-64.asm

   1 ;
   2 ; jcqnts2f-64.asm - sample data conversion and quantization (64-bit SSE & SSE2)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright 2009 D. R. Commander
   6 ;
   7 ; Based on
   8 ; x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler),
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17 ;
  18 ; [TAB8]
  19
  20 %include "jsimdext.inc"
  21 %include "jdct.inc"
  22
  23 ; --------------------------------------------------------------------------
  24         SECTION SEG_TEXT
  25         BITS    64
  26 ;
  27 ; Load data into workspace, applying unsigned->signed conversion
  28 ;
  29 ; GLOBAL(void)
  30 ; jsimd_convsamp_float_sse2 (JSAMPARRAY sample_data, JDIMENSION start_col,
  31 ;                            FAST_FLOAT * workspace);
  32 ;
  33
  34 ; r10 = JSAMPARRAY sample_data
  35 ; r11 = JDIMENSION start_col
  36 ; r12 = FAST_FLOAT * workspace
  37
  38         align   16
  39         global  EXTN(jsimd_convsamp_float_sse2)
  40
  41 EXTN(jsimd_convsamp_float_sse2):
  42         push    rbp
  43         mov     rax,rsp
  44         mov     rbp,rsp
  45         collect_args
  46         push    rbx
  47
  48         pcmpeqw  xmm7,xmm7
  49         psllw    xmm7,7
  50         packsswb xmm7,xmm7              ; xmm7 = PB_CENTERJSAMPLE (0x808080..)
  51
  52         mov rsi, r10
  53         mov     rax, r11
  54         mov rdi, r12
  55         mov     rcx, DCTSIZE/2
  56 .convloop:
  57         mov     rbx, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]   ; (JSAMPLE *)
  58         mov rdx, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]       ; (JSAMPLE *)
  59
  60         movq    xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]
  61         movq    xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]
  62
  63         psubb   xmm0,xmm7                       ; xmm0=(01234567)
  64         psubb   xmm1,xmm7                       ; xmm1=(89ABCDEF)
  65
  66         punpcklbw xmm0,xmm0                     ; xmm0=(*0*1*2*3*4*5*6*7)
  67         punpcklbw xmm1,xmm1                     ; xmm1=(*8*9*A*B*C*D*E*F)
  68
  69         punpcklwd xmm2,xmm0                     ; xmm2=(***0***1***2***3)
  70         punpckhwd xmm0,xmm0                     ; xmm0=(***4***5***6***7)
  71         punpcklwd xmm3,xmm1                     ; xmm3=(***8***9***A***B)
  72         punpckhwd xmm1,xmm1                     ; xmm1=(***C***D***E***F)
  73
  74         psrad     xmm2,(DWORD_BIT-BYTE_BIT)     ; xmm2=(0123)
  75         psrad     xmm0,(DWORD_BIT-BYTE_BIT)     ; xmm0=(4567)
  76         cvtdq2ps  xmm2,xmm2                     ; xmm2=(0123)
  77         cvtdq2ps  xmm0,xmm0                     ; xmm0=(4567)
  78         psrad     xmm3,(DWORD_BIT-BYTE_BIT)     ; xmm3=(89AB)
  79         psrad     xmm1,(DWORD_BIT-BYTE_BIT)     ; xmm1=(CDEF)
  80         cvtdq2ps  xmm3,xmm3                     ; xmm3=(89AB)
  81         cvtdq2ps  xmm1,xmm1                     ; xmm1=(CDEF)
  82
  83         movaps  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_FAST_FLOAT)], xmm2
  84         movaps  XMMWORD [XMMBLOCK(0,1,rdi,SIZEOF_FAST_FLOAT)], xmm0
  85         movaps  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_FAST_FLOAT)], xmm3
  86         movaps  XMMWORD [XMMBLOCK(1,1,rdi,SIZEOF_FAST_FLOAT)], xmm1
  87
  88         add     rsi, byte 2*SIZEOF_JSAMPROW
  89         add     rdi, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT
  90         dec     rcx
  91         jnz     short .convloop
  92
  93         pop     rbx
  94         uncollect_args
  95         pop     rbp
  96         ret
  97
  98
  99 ; --------------------------------------------------------------------------
 100 ;
 101 ; Quantize/descale the coefficients, and store into coef_block
 102 ;
 103 ; GLOBAL(void)
 104 ; jsimd_quantize_float_sse2 (JCOEFPTR coef_block, FAST_FLOAT * divisors,
 105 ;                         FAST_FLOAT * workspace);
 106 ;
 107
 108 ; r10 = JCOEFPTR coef_block
 109 ; r11 = FAST_FLOAT * divisors
 110 ; r12 = FAST_FLOAT * workspace
 111
 112         align   16
 113         global  EXTN(jsimd_quantize_float_sse2)
 114
 115 EXTN(jsimd_quantize_float_sse2):
 116         push    rbp
 117         mov     rax,rsp
 118         mov     rbp,rsp
 119         collect_args
 120
 121         mov rsi, r12
 122         mov rdx, r11
 123         mov rdi, r10
 124         mov     rax, DCTSIZE2/16
 125 .quantloop:
 126         movaps  xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_FAST_FLOAT)]
 127         movaps  xmm1, XMMWORD [XMMBLOCK(0,1,rsi,SIZEOF_FAST_FLOAT)]
 128         mulps   xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
 129         mulps   xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
 130         movaps  xmm2, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_FAST_FLOAT)]
 131         movaps  xmm3, XMMWORD [XMMBLOCK(1,1,rsi,SIZEOF_FAST_FLOAT)]
 132         mulps   xmm2, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
 133         mulps   xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
 134
 135         cvtps2dq xmm0,xmm0
 136         cvtps2dq xmm1,xmm1
 137         cvtps2dq xmm2,xmm2
 138         cvtps2dq xmm3,xmm3
 139
 140         packssdw xmm0,xmm1
 141         packssdw xmm2,xmm3
 142
 143         movdqa  XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_JCOEF)], xmm0
 144         movdqa  XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_JCOEF)], xmm2
 145
 146         add     rsi, byte 16*SIZEOF_FAST_FLOAT
 147         add     rdx, byte 16*SIZEOF_FAST_FLOAT
 148         add     rdi, byte 16*SIZEOF_JCOEF
 149         dec     rax
 150         jnz     short .quantloop
 151
 152         uncollect_args
 153         pop     rbp
 154         ret
 155
 156 ; Pad the end of this segment to a 16-byte boundary.  For some reason, the
 157 ; OS X linker does not honor the segment's alignment request unless we do this.
 158         align   16