Copyright (C) 2005-2006 Paul Davis, John Rigg

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Author: Sampo Savolainen
64-bit conversion: John Rigg
#; Microsoft version of AVX sample processing functions

#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
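#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes; the loop body is the whole contract:
#;
#;   void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src,
#;                                           unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i] * gain; /* mix src into dst, scaled */
#;       }
#;   }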
.globl x86_sse_avx_mix_buffers_with_gain
.def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_with_gain:
#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; %xmm3 float gain
pushq %rbx #; must be preserved

#; move gain to %xmm0 for convenience
movss %xmm3, %xmm0

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .MBWG_NONALIGN #; if the buffers are not mutually aligned, process frame by frame
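#; For illustration only: the mask $28 (binary 11100) extracts the byte
#; offset of a 4-byte-aligned float pointer within a 32 byte block, so
#; in C the test above is roughly (same_32byte_phase is a hypothetical
#; helper name, not from this file):
#;
#;   #include <stdint.h>
#;   static int same_32byte_phase (const float *dst, const float *src)
#;   {
#;       return ((uintptr_t)dst & 28) == ((uintptr_t)src & 28);
#;   }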
#; Pre-loop: process 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

#; gain is already in %xmm0

movss (%rdx), %xmm1 #; src => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
addss (%rcx), %xmm1 #; xmm1 += dst
movss %xmm1, (%rcx) #; xmm1 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment
cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; we know nframes is not zero, but if fewer than 8 frames remain
jl .MBWG_NONALIGN #; we jump straight to the frame-by-frame code
#; set up the gain buffer (gain is already in %xmm0)
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits
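#; For illustration only: the two instructions above implement a scalar
#; broadcast; with AVX intrinsics in C the equivalent one-liner is:
#;
#;   #include <immintrin.h>
#;   __m256 gain8 = _mm256_set1_ps (gain); /* gain in all 8 lanes */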
vmovaps (%rdx), %ymm1 #; source => ymm1
vmulps %ymm0, %ymm1, %ymm2 #; apply gain to source
vaddps (%rcx), %ymm2, %ymm1 #; mix with destination
vmovaps %ymm1, (%rcx) #; copy result to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper
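#; For illustration only: in C with intrinsics this step is
#; _mm256_zeroupper(), which avoids the AVX-to-SSE transition penalty
#; on processors that track dirty upper halves of the ymm registers.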
#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

.MBWG_NONALIGN:

#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:
movss (%rdx), %xmm1 #; src => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
addss (%rcx), %xmm1 #; xmm1 += dst
movss %xmm1, (%rcx) #; xmm1 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--
jnz .MBWG_NONALIGNLOOP
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
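#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src,
#;                                         unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i]; /* mix src into dst at unity gain */
#;       }
#;   }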
.globl x86_sse_avx_mix_buffers_no_gain
.def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_no_gain:

#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .MBNG_NONALIGN #; if the buffers are not mutually aligned, process frame by frame

cmp $0, %rbx
je .MBNG_AVX #; already aligned at 32 bytes, proceed to AVX
#; Pre-loop: process 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

movss (%rdx), %xmm0 #; src => xmm0
addss (%rcx), %xmm0 #; xmm0 += dst
movss %xmm0, (%rcx) #; xmm0 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment

cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; if frames remain, but fewer than 8,
jl .MBNG_NONALIGN #; we cannot run AVX; finish frame by frame
vmovaps (%rdx), %ymm0 #; source => ymm0
vaddps (%rcx), %ymm0, %ymm1 #; mix with destination
vmovaps %ymm1, (%rcx) #; copy result to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

movss (%rdx), %xmm0 #; src => xmm0
addss (%rcx), %xmm0 #; xmm0 += dst
movss %xmm0, (%rcx) #; xmm0 => dst
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
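#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes (equivalent to memcpy over float data):
#;
#;   void x86_sse_avx_copy_vector (float *dst, float *src,
#;                                 unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] = src[i];
#;       }
#;   }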
.globl x86_sse_avx_copy_vector
.def x86_sse_avx_copy_vector; .scl 2; .type 32; .endef

x86_sse_avx_copy_vector:

#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .CB_NONALIGN #; if the buffers are not mutually aligned, copy frame by frame

cmp $0, %rbx
je .CB_AVX #; already aligned at 32 bytes, proceed to AVX
#; Pre-loop: copy 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

movss (%rdx), %xmm0 #; src => xmm0
movss %xmm0, (%rcx) #; xmm0 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment

cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; if frames remain, but fewer than 8,
jl .CB_NONALIGN #; we cannot run AVX; finish frame by frame
vmovaps (%rdx), %ymm0 #; source => ymm0
vmovaps %ymm0, (%rcx) #; copy to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

movss (%rdx), %xmm0 #; src => xmm0
movss %xmm0, (%rcx) #; xmm0 => dst
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
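#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   void x86_sse_avx_apply_gain_to_buffer (float *buf,
#;                                          unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           buf[i] *= gain;
#;       }
#;   }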
.globl x86_sse_avx_apply_gain_to_buffer
.def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32; .endef

x86_sse_avx_apply_gain_to_buffer:

#; due to the Microsoft calling convention:
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float gain
#; move gain to %xmm0 for convenience
movss %xmm2, %xmm0

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %r8 #; buf address => %r8
andq $28, %r8 #; mask the alignment offset (binary 11100)
jz .AG_AVX #; if the buffer IS aligned, jump straight to AVX
#; we iterate 1-7 times, multiplying one frame per pass with scalar
#; SSE instructions, until "buf" (%rcx) reaches a 32 byte aligned value

.AGLP_START:

#; load next value from the buffer into %xmm1
movss (%rcx), %xmm1 #; buf[0] => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
movss %xmm1, (%rcx) #; xmm1 => buf[0]

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jz .AG_END #; if we run out of frames, go to the end

addq $4, %r8 #; 4 bytes closer to 32 byte alignment
cmp $32, %r8
jne .AGLP_START #; if unaligned frames remain, do another pass
#; we have reached the 32 byte aligned "buf" (%rcx) value;
#; use AVX instructions

#; figure out how many loops we should do
movq %rdx, %rax #; copy remaining nframes to %rax for division
shr $3, %rax #; unsigned divide by 8

#; %rax = number of AVX iterations
#; set up the gain buffer (gain is already in %xmm0)
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits
vmovaps (%rcx), %ymm1 #; buf => ymm1
vmulps %ymm0, %ymm1, %ymm2 #; apply gain
vmovaps %ymm2, (%rcx) #; store the result back to buf

addq $32, %rcx #; buf += 8 frames
subq $8, %rdx #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; next we need to post-process all remaining frames;
#; the remaining frame count is in %rdx
.AGPOST_START:
movss (%rcx), %xmm1 #; buf[0] => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
movss %xmm1, (%rcx) #; xmm1 => buf[0]

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jnz .AGPOST_START #; while frames remain, do another pass

.AG_END:
ret
#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
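#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   #include <math.h>
#;
#;   float x86_sse_avx_compute_peak (float *buf, long nframes, float current)
#;   {
#;       for (long i = 0; i < nframes; i++) {
#;           current = fmaxf (current, fabsf (buf[i]));
#;       }
#;       return current;
#;   }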
.globl x86_sse_avx_compute_peak
.def x86_sse_avx_compute_peak; .scl 2; .type 32; .endef

x86_sse_avx_compute_peak:

#; due to the Microsoft calling convention:
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float current
#; move current max to %xmm0 for convenience
movss %xmm2, %xmm0

#; if nframes == 0, go to end

#; create the "abs" mask in %xmm3;
#; it will be used to discard the sign bit
#; check for alignment
movq %rcx, %r8 #; buf address => %r8
andq $28, %r8 #; mask the alignment offset (binary 11100)
jz .CP_AVX #; if the buffer IS aligned, jump straight to AVX
#; we iterate 1-7 times, doing scalar SSE float comparisons,
#; until "buf" (%rcx) reaches a 32 byte aligned value

.LP_START:

#; load next value from the buffer
movss (%rcx), %xmm1 #; buf[0] => xmm1
andps %xmm3, %xmm1 #; mask out sign bit
maxss %xmm1, %xmm0 #; current maximum => xmm0

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jz .CP_END #; if we run out of frames, go to the end

addq $4, %r8 #; 4 bytes closer to 32 byte alignment
cmp $32, %r8
jne .LP_START #; if unaligned frames remain, do another pass
#; we have reached the 32 byte aligned "buf" (%rcx) value

#; figure out how many loops we should do
movq %rdx, %rax #; copy remaining nframes to %rax for division
shr $3, %rax #; unsigned divide by 8

#; %rax = number of AVX iterations
#; the current maximum is in %xmm0, but we need to broadcast it
#; to the whole ymm0 register
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits

#; broadcast the sign mask to the whole ymm3 register
vshufps $0x00, %ymm3, %ymm3, %ymm3 #; replicate the mask across the lower 128 bits of ymm3
vperm2f128 $0x00, %ymm3, %ymm3, %ymm3 #; copy the lower 128 bits of ymm3 to the upper 128 bits
vmovaps (%rcx), %ymm1 #; buf => ymm1
vandps %ymm3, %ymm1, %ymm1 #; mask out sign bits
vmaxps %ymm1, %ymm0, %ymm0 #; update the running maxima

addq $32, %rcx #; buf += 8 frames
subq $8, %rdx #; nframes -= 8
#; calculate the maximum value contained in the 8 floats of %ymm0
vshufps $0x4e, %ymm0, %ymm0, %ymm1 #; swap the left and right pairs (1234 => 3412) in each 128 bit half
vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maxima; at most 4 distinct values remain
vshufps $0xb1, %ymm0, %ymm0, %ymm1 #; swap the floats inside each pair (1234 => 2143) in each 128 bit half
vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maxima; at most 2 distinct values remain
vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
vmaxps %ymm1, %ymm0, %ymm0 #; all 8 elements now hold the maximum
#; now every float in %ymm0 holds the same value: the current maximum
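#; For illustration only: the same 3-step horizontal max with AVX
#; intrinsics in C (the helper name hmax8 is ours, not from this file):
#;
#;   #include <immintrin.h>
#;   static float hmax8 (__m256 v)
#;   {
#;       __m256 t = _mm256_max_ps (v, _mm256_shuffle_ps (v, v, 0x4e));
#;       t = _mm256_max_ps (t, _mm256_shuffle_ps (t, t, 0xb1));
#;       t = _mm256_max_ps (t, _mm256_permute2f128_ps (t, t, 0x01));
#;       return _mm256_cvtss_f32 (t);
#;   }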
#; next we need to post-process all remaining frames;
#; the remaining frame count is in %rdx

#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if no remaining frames, jump to the end
cmp $0, %rdx
jz .CP_END
movss (%rcx), %xmm1 #; buf[0] => xmm1
andps %xmm3, %xmm1 #; mask out sign bit
maxss %xmm1, %xmm0 #; update current maximum

addq $4, %rcx #; buf++
decq %rdx #; nframes--

.CP_END:
#; return value is in %xmm0
ret