simd/x86_64/jfdctflt-sse.asm

   1 ;
   2 ; jfdctflt.asm - floating-point FDCT (64-bit SSE)
   3 ;
   4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
   5 ; Copyright (C) 2009, 2016, D. R. Commander.
   6 ; Copyright (C) 2023, Aliaksiej Kandracienka.
   7 ;
   8 ; Based on the x86 SIMD extension for IJG JPEG library
   9 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
  10 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
  11 ;
  12 ; This file should be assembled with NASM (Netwide Assembler); it
  13 ; can *not* be assembled with Microsoft's MASM or any compatible
  14 ; assembler (including Borland's Turbo Assembler).
  15 ; NASM is available from http://nasm.sourceforge.net/ or
  16 ; http://sourceforge.net/project/showfiles.php?group_id=6208
  17 ;
  18 ; This file contains a floating-point implementation of the forward DCT
  19 ; (Discrete Cosine Transform). The following code is based directly on
  20 ; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
  21
  22 %include "jsimdext.inc"
  23 %include "jdct.inc"
  24
  25 ; --------------------------------------------------------------------------
  26
  27 %macro  unpcklps2 2  ; low halves: %1=(a0 a1 a2 a3) / %2=(b0 b1 b2 b3) => %1=(a0 a1 b0 b1)
  28     shufps      %1, %2, 01000100b       ; selector fields, high to low: src[1] src[0] dst[1] dst[0]
  29 %endmacro
  30
  31 %macro  unpckhps2 2  ; high halves: %1=(a0 a1 a2 a3) / %2=(b0 b1 b2 b3) => %1=(a2 a3 b2 b3)
  32     shufps      %1, %2, 11101110b       ; selector fields, high to low: src[3] src[2] dst[3] dst[2]
  33 %endmacro
  34
  35 ; --------------------------------------------------------------------------
  36     SECTION     SEG_CONST
  37
     ; Multiplier table for the float FDCT below.  Each constant is emitted
     ; four times (times 4 dd) so that it fills one xmmword and can be used
     ; directly as the memory operand of a packed mulps.
     ; With ck = cos(k*pi/16):
     ;   PD_0_382 = c6,  PD_0_707 = c4,  PD_0_541 = c2-c6,  PD_1_306 = c2+c6
     ; NOTE(review): alignz and GLOBAL_DATA are macros from the included
     ; headers; alignz 32 presumably pads to a 32-byte boundary -- confirm in
     ; jsimdext.inc.
  38     alignz      32
  39     GLOBAL_DATA(jconst_fdct_float_sse)
  40
  41 EXTN(jconst_fdct_float_sse):
  42
  43 PD_0_382 times 4 dd 0.382683432365089771728460  ; c6
  44 PD_0_707 times 4 dd 0.707106781186547524400844  ; c4
  45 PD_0_541 times 4 dd 0.541196100146196984399723  ; c2-c6
  46 PD_1_306 times 4 dd 1.306562964876376527856643  ; c2+c6
  47
  48     alignz      32
  49
  50 ; --------------------------------------------------------------------------
  51     SECTION     SEG_TEXT
  52     BITS        64
  53 ;
  54 ; Perform the forward DCT on one block of samples.
  55 ;
  56 ; GLOBAL(void)
  57 ; jsimd_fdct_float_sse(FAST_FLOAT *data)
  58 ;
     ; The block is transformed in place: everything loaded through rdx is
     ; stored back to the same XMMBLOCK slots it came from.
     ;
     ; Two passes are made over the block.  Each pass loops DCTSIZE/4 times and
     ; handles four rows (pass 1) or four columns (pass 2) per iteration, one
     ; per lane of an XMM register.  Inside an iteration the loaded values are
     ; first transposed (unpcklps/unpckhps, then the shufps-based unpck*ps2
     ; macros) so that each register holds one "dataN" term for all four
     ; lanes; the butterfly arithmetic then follows the even part / odd part
     ; structure named in the comments (tmp0..tmp7, tmp10..tmp13, z1..z13).
     ;
     ; Registers written here: rcx (loop counter), rdx (block pointer),
     ; xmm0-xmm7 and the flags.  rbp and r15 are saved and restored.  Only
     ; xmm0-xmm7 are used, so two intermediates at a time are parked in the
     ; stack array wk[].
     ;
     ; NOTE(review): collect_args/uncollect_args, XMMBLOCK, EXTN,
     ; GLOBAL_FUNCTION, DCTSIZE, SIZEOF_XMMWORD and SIZEOF_FAST_FLOAT come from
     ; the included headers and are not visible here; any register saving
     ; required by the target ABI beyond rbp/r15 is presumably done by
     ; collect_args -- confirm in jsimdext.inc.
  59
  60 ; r10 = FAST_FLOAT *data
  61
     ; wk(i) is the address expression of the i-th xmmword of scratch space,
     ; counted upward from (r15 - WK_NUM * SIZEOF_XMMWORD); r15 points just
     ; past the last element.  WK_NUM is expanded when wk() is used, so
     ; defining it on the following line is fine.
  62 %define wk(i)   r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD  ; xmmword wk[WK_NUM]
  63 %define WK_NUM  2
  64
  65     align       32
  66     GLOBAL_FUNCTION(jsimd_fdct_float_sse)
  67
  68 EXTN(jsimd_fdct_float_sse):
  69     push        rbp
  70     mov         rbp, rsp
  71     push        r15                     ; saved at [rbp-8]; r15 is reused as the wk base
  72     and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
  73     ; Allocate stack space for wk array.  r15 is used to access it.
         ; rsp was just aligned, so every wk(i) is a valid movaps operand.
  74     mov         r15, rsp
  75     sub         rsp, byte (SIZEOF_XMMWORD * WK_NUM)
  76     collect_args 1
  77
  78     ; ---- Pass 1: process rows.
  79
  80     mov         rdx, r10                ; (FAST_FLOAT *)
  81     mov         rcx, DCTSIZE/4
  82 .rowloop:
         ; Each iteration loads rows 0-3 relative to rdx, two xmmwords per row.
  83
  84     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
  85     movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
  86     movaps      xmm2, XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)]
  87     movaps      xmm3, XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)]
  88
  89     ; xmm0=(20 21 22 23), xmm2=(24 25 26 27)
  90     ; xmm1=(30 31 32 33), xmm3=(34 35 36 37)
  91
  92     movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
  93     unpcklps    xmm0, xmm1              ; xmm0=(20 30 21 31)
  94     unpckhps    xmm4, xmm1              ; xmm4=(22 32 23 33)
  95     movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
  96     unpcklps    xmm2, xmm3              ; xmm2=(24 34 25 35)
  97     unpckhps    xmm5, xmm3              ; xmm5=(26 36 27 37)
  98
  99     movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
 100     movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
 101     movaps      xmm1, XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)]
 102     movaps      xmm3, XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)]
 103
 104     ; xmm6=(00 01 02 03), xmm1=(04 05 06 07)
 105     ; xmm7=(10 11 12 13), xmm3=(14 15 16 17)
 106
         ; Spill two half-transposed registers so xmm4/xmm2 can be reused below.
 107     movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 32 23 33)
 108     movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(24 34 25 35)
 109
 110     movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
 111     unpcklps    xmm6, xmm7              ; xmm6=(00 10 01 11)
 112     unpckhps    xmm4, xmm7              ; xmm4=(02 12 03 13)
 113     movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
 114     unpcklps    xmm1, xmm3              ; xmm1=(04 14 05 15)
 115     unpckhps    xmm2, xmm3              ; xmm2=(06 16 07 17)
 116
 117     movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
 118     unpcklps2   xmm6, xmm0              ; xmm6=(00 10 20 30)=data0
 119     unpckhps2   xmm7, xmm0              ; xmm7=(01 11 21 31)=data1
 120     movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
 121     unpcklps2   xmm2, xmm5              ; xmm2=(06 16 26 36)=data6
 122     unpckhps2   xmm3, xmm5              ; xmm3=(07 17 27 37)=data7
 123
         ; First butterfly stage on the outer pairs (0,7) and (1,6).
 124     movaps      xmm0, xmm7
 125     movaps      xmm5, xmm6
 126     subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
 127     subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
 128     addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
 129     addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
 130
         ; Swap: reload the spilled transpose halves, park tmp6/tmp7 in their place
         ; until the odd part needs them.
 131     movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 32 23 33)
 132     movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(24 34 25 35)
 133     movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
 134     movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
 135
 136     movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
 137     unpcklps2   xmm4, xmm2              ; xmm4=(02 12 22 32)=data2
 138     unpckhps2   xmm7, xmm2              ; xmm7=(03 13 23 33)=data3
 139     movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
 140     unpcklps2   xmm1, xmm3              ; xmm1=(04 14 24 34)=data4
 141     unpckhps2   xmm6, xmm3              ; xmm6=(05 15 25 35)=data5
 142
         ; First butterfly stage on the inner pairs (3,4) and (2,5).
 143     movaps      xmm2, xmm7
 144     movaps      xmm3, xmm4
 145     addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
 146     addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
 147     subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
 148     subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
 149
 150     ; -- Even part
         ; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.
         ; xmm2=tmp4 and xmm3=tmp5 are left untouched for the odd part.
 151
 152     movaps      xmm1, xmm5
 153     movaps      xmm6, xmm0
 154     subps       xmm5, xmm7              ; xmm5=tmp13
 155     subps       xmm0, xmm4              ; xmm0=tmp12
 156     addps       xmm1, xmm7              ; xmm1=tmp10
 157     addps       xmm6, xmm4              ; xmm6=tmp11
 158
 159     addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
 160     mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
 161
 162     movaps      xmm7, xmm1
 163     movaps      xmm4, xmm5
 164     subps       xmm1, xmm6              ; xmm1=data4
 165     subps       xmm5, xmm0              ; xmm5=data6
 166     addps       xmm7, xmm6              ; xmm7=data0
 167     addps       xmm4, xmm0              ; xmm4=data2
 168
         ; Store the even outputs back into the block.
 169     movaps      XMMWORD [XMMBLOCK(0,1,rdx,SIZEOF_FAST_FLOAT)], xmm1
 170     movaps      XMMWORD [XMMBLOCK(2,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
 171     movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
 172     movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
 173
 174     ; -- Odd part
         ; Inputs: xmm2=tmp4, xmm3=tmp5, plus tmp6/tmp7 reloaded from wk.
 175
 176     movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
 177     movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
 178
 179     addps       xmm2, xmm3              ; xmm2=tmp10
 180     addps       xmm3, xmm6              ; xmm3=tmp11
 181     addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
 182
 183     mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
 184
 185     movaps      xmm1, xmm2              ; xmm1=tmp10
 186     subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
 187     mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
 188     mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
 189     mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
 190     addps       xmm1, xmm2              ; xmm1=z2
 191     addps       xmm6, xmm2              ; xmm6=z4
 192
 193     movaps      xmm5, xmm0
 194     subps       xmm0, xmm3              ; xmm0=z13
 195     addps       xmm5, xmm3              ; xmm5=z11
 196
 197     movaps      xmm7, xmm0
 198     movaps      xmm4, xmm5
 199     subps       xmm0, xmm1              ; xmm0=data3
 200     subps       xmm5, xmm6              ; xmm5=data7
 201     addps       xmm7, xmm1              ; xmm7=data5
 202     addps       xmm4, xmm6              ; xmm4=data1
 203
         ; Store the odd outputs back into the block.
 204     movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
 205     movaps      XMMWORD [XMMBLOCK(3,1,rdx,SIZEOF_FAST_FLOAT)], xmm5
 206     movaps      XMMWORD [XMMBLOCK(1,1,rdx,SIZEOF_FAST_FLOAT)], xmm7
 207     movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
 208
         ; Advance by four rows of DCTSIZE floats each.
 209     add         rdx, 4*DCTSIZE*SIZEOF_FAST_FLOAT
 210     dec         rcx
 211     jnz         near .rowloop
 212
 213     ; ---- Pass 2: process columns.
 214
 215     mov         rdx, r10                ; (FAST_FLOAT *)
 216     mov         rcx, DCTSIZE/4
 217 .columnloop:
         ; Each iteration loads one xmmword from each of rows 0-7, i.e. a strip
         ; four columns wide starting at rdx.
 218
 219     movaps      xmm0, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)]
 220     movaps      xmm1, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)]
 221     movaps      xmm2, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)]
 222     movaps      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)]
 223
 224     ; xmm0=(02 12 22 32), xmm2=(42 52 62 72)
 225     ; xmm1=(03 13 23 33), xmm3=(43 53 63 73)
 226
 227     movaps      xmm4, xmm0              ; transpose coefficients(phase 1)
 228     unpcklps    xmm0, xmm1              ; xmm0=(02 03 12 13)
 229     unpckhps    xmm4, xmm1              ; xmm4=(22 23 32 33)
 230     movaps      xmm5, xmm2              ; transpose coefficients(phase 1)
 231     unpcklps    xmm2, xmm3              ; xmm2=(42 43 52 53)
 232     unpckhps    xmm5, xmm3              ; xmm5=(62 63 72 73)
 233
 234     movaps      xmm6, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)]
 235     movaps      xmm7, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)]
 236     movaps      xmm1, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)]
 237     movaps      xmm3, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)]
 238
 239     ; xmm6=(00 10 20 30), xmm1=(40 50 60 70)
 240     ; xmm7=(01 11 21 31), xmm3=(41 51 61 71)
 241
         ; Spill two half-transposed registers so xmm4/xmm2 can be reused below.
 242     movaps      XMMWORD [wk(0)], xmm4   ; wk(0)=(22 23 32 33)
 243     movaps      XMMWORD [wk(1)], xmm2   ; wk(1)=(42 43 52 53)
 244
 245     movaps      xmm4, xmm6              ; transpose coefficients(phase 1)
 246     unpcklps    xmm6, xmm7              ; xmm6=(00 01 10 11)
 247     unpckhps    xmm4, xmm7              ; xmm4=(20 21 30 31)
 248     movaps      xmm2, xmm1              ; transpose coefficients(phase 1)
 249     unpcklps    xmm1, xmm3              ; xmm1=(40 41 50 51)
 250     unpckhps    xmm2, xmm3              ; xmm2=(60 61 70 71)
 251
 252     movaps      xmm7, xmm6              ; transpose coefficients(phase 2)
 253     unpcklps2   xmm6, xmm0              ; xmm6=(00 01 02 03)=data0
 254     unpckhps2   xmm7, xmm0              ; xmm7=(10 11 12 13)=data1
 255     movaps      xmm3, xmm2              ; transpose coefficients(phase 2)
 256     unpcklps2   xmm2, xmm5              ; xmm2=(60 61 62 63)=data6
 257     unpckhps2   xmm3, xmm5              ; xmm3=(70 71 72 73)=data7
 258
         ; First butterfly stage on the outer pairs (0,7) and (1,6).
 259     movaps      xmm0, xmm7
 260     movaps      xmm5, xmm6
 261     subps       xmm7, xmm2              ; xmm7=data1-data6=tmp6
 262     subps       xmm6, xmm3              ; xmm6=data0-data7=tmp7
 263     addps       xmm0, xmm2              ; xmm0=data1+data6=tmp1
 264     addps       xmm5, xmm3              ; xmm5=data0+data7=tmp0
 265
         ; Swap: reload the spilled transpose halves, park tmp6/tmp7 in their place
         ; until the odd part needs them.
 266     movaps      xmm2, XMMWORD [wk(0)]   ; xmm2=(22 23 32 33)
 267     movaps      xmm3, XMMWORD [wk(1)]   ; xmm3=(42 43 52 53)
 268     movaps      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp6
 269     movaps      XMMWORD [wk(1)], xmm6   ; wk(1)=tmp7
 270
 271     movaps      xmm7, xmm4              ; transpose coefficients(phase 2)
 272     unpcklps2   xmm4, xmm2              ; xmm4=(20 21 22 23)=data2
 273     unpckhps2   xmm7, xmm2              ; xmm7=(30 31 32 33)=data3
 274     movaps      xmm6, xmm1              ; transpose coefficients(phase 2)
 275     unpcklps2   xmm1, xmm3              ; xmm1=(40 41 42 43)=data4
 276     unpckhps2   xmm6, xmm3              ; xmm6=(50 51 52 53)=data5
 277
         ; First butterfly stage on the inner pairs (3,4) and (2,5).
 278     movaps      xmm2, xmm7
 279     movaps      xmm3, xmm4
 280     addps       xmm7, xmm1              ; xmm7=data3+data4=tmp3
 281     addps       xmm4, xmm6              ; xmm4=data2+data5=tmp2
 282     subps       xmm2, xmm1              ; xmm2=data3-data4=tmp4
 283     subps       xmm3, xmm6              ; xmm3=data2-data5=tmp5
 284
 285     ; -- Even part
         ; Inputs: xmm5=tmp0, xmm0=tmp1, xmm4=tmp2, xmm7=tmp3.
         ; xmm2=tmp4 and xmm3=tmp5 are left untouched for the odd part.
 286
 287     movaps      xmm1, xmm5
 288     movaps      xmm6, xmm0
 289     subps       xmm5, xmm7              ; xmm5=tmp13
 290     subps       xmm0, xmm4              ; xmm0=tmp12
 291     addps       xmm1, xmm7              ; xmm1=tmp10
 292     addps       xmm6, xmm4              ; xmm6=tmp11
 293
 294     addps       xmm0, xmm5              ; xmm0=tmp12+tmp13
 295     mulps       xmm0, [rel PD_0_707]    ; xmm0=z1
 296
 297     movaps      xmm7, xmm1
 298     movaps      xmm4, xmm5
 299     subps       xmm1, xmm6              ; xmm1=data4
 300     subps       xmm5, xmm0              ; xmm5=data6
 301     addps       xmm7, xmm6              ; xmm7=data0
 302     addps       xmm4, xmm0              ; xmm4=data2
 303
         ; Store the even outputs back into the block.
 304     movaps      XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_FAST_FLOAT)], xmm1
 305     movaps      XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
 306     movaps      XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
 307     movaps      XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
 308
 309     ; -- Odd part
         ; Inputs: xmm2=tmp4, xmm3=tmp5, plus tmp6/tmp7 reloaded from wk.
 310
 311     movaps      xmm6, XMMWORD [wk(0)]   ; xmm6=tmp6
 312     movaps      xmm0, XMMWORD [wk(1)]   ; xmm0=tmp7
 313
 314     addps       xmm2, xmm3              ; xmm2=tmp10
 315     addps       xmm3, xmm6              ; xmm3=tmp11
 316     addps       xmm6, xmm0              ; xmm6=tmp12, xmm0=tmp7
 317
 318     mulps       xmm3, [rel PD_0_707]    ; xmm3=z3
 319
 320     movaps      xmm1, xmm2              ; xmm1=tmp10
 321     subps       xmm2, xmm6              ; xmm2=tmp10-tmp12
 322     mulps       xmm2, [rel PD_0_382]    ; xmm2=z5
 323     mulps       xmm1, [rel PD_0_541]    ; xmm1=MULTIPLY(tmp10,FIX_0_541196)
 324     mulps       xmm6, [rel PD_1_306]    ; xmm6=MULTIPLY(tmp12,FIX_1_306562)
 325     addps       xmm1, xmm2              ; xmm1=z2
 326     addps       xmm6, xmm2              ; xmm6=z4
 327
 328     movaps      xmm5, xmm0
 329     subps       xmm0, xmm3              ; xmm0=z13
 330     addps       xmm5, xmm3              ; xmm5=z11
 331
 332     movaps      xmm7, xmm0
 333     movaps      xmm4, xmm5
 334     subps       xmm0, xmm1              ; xmm0=data3
 335     subps       xmm5, xmm6              ; xmm5=data7
 336     addps       xmm7, xmm1              ; xmm7=data5
 337     addps       xmm4, xmm6              ; xmm4=data1
 338
         ; Store the odd outputs back into the block.
 339     movaps      XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_FAST_FLOAT)], xmm0
 340     movaps      XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_FAST_FLOAT)], xmm5
 341     movaps      XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_FAST_FLOAT)], xmm7
 342     movaps      XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_FAST_FLOAT)], xmm4
 343
         ; Advance by four floats, i.e. to the next strip of four columns.
 344     add         rdx, byte 4*SIZEOF_FAST_FLOAT
 345     dec         rcx
 346     jnz         near .columnloop
 347
 348     uncollect_args 1
         ; rbp-8 is where r15 was pushed; reloading rsp from rbp drops the wk
         ; array and the alignment padding in one step.
 349     lea         rsp, [rbp-8]
 350     pop         r15
 351     pop         rbp
 352     ret
 353
 354 ; For some reason, the OS X linker does not honor the request to align the
 355 ; segment unless we do this.
     ; (The directive below pads the end of this file's code to a 32-byte
     ; boundary.)
 356     align       32