Copyright (C) 2005-2006 Paul Davis, John Rigg

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Author: Sampo Savolainen
64-bit conversion: John Rigg
#; Microsoft version of AVX sample processing functions

#; void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
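#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes; the loop body is the whole contract:
#;
#;   void x86_sse_avx_mix_buffers_with_gain (float *dst, float *src,
#;                                           unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i] * gain; /* mix src into dst, scaled */
#;       }
#;   }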
.globl x86_sse_avx_mix_buffers_with_gain
.def x86_sse_avx_mix_buffers_with_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_with_gain:
#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; %xmm3 float gain
pushq %rbx #; must be preserved

#; move gain to %xmm0 for convenience
movss %xmm3, %xmm0

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .MBWG_NONALIGN #; if the buffers are not mutually aligned, process frame by frame
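#; For illustration only: the mask $28 (binary 11100) extracts the byte
#; offset of a 4-byte-aligned float pointer within a 32 byte block, so
#; in C the test above is roughly (same_32byte_phase is a hypothetical
#; helper name, not from this file):
#;
#;   #include <stdint.h>
#;   static int same_32byte_phase (const float *dst, const float *src)
#;   {
#;       return ((uintptr_t)dst & 28) == ((uintptr_t)src & 28);
#;   }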
#; Pre-loop: process 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

#; gain is already in %xmm0

movss (%rdx), %xmm1 #; src => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
addss (%rcx), %xmm1 #; xmm1 += dst
movss %xmm1, (%rcx) #; xmm1 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment
cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; we know nframes is not zero, but if fewer than 8 frames remain
jl .MBWG_NONALIGN #; we jump straight to the frame-by-frame code
#; set up the gain buffer (gain is already in %xmm0)
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits
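#; For illustration only: the two instructions above implement a scalar
#; broadcast; with AVX intrinsics in C the equivalent one-liner is:
#;
#;   #include <immintrin.h>
#;   __m256 gain8 = _mm256_set1_ps (gain); /* gain in all 8 lanes */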
vmovaps (%rdx), %ymm1 #; source => ymm1
vmulps %ymm0, %ymm1, %ymm2 #; apply gain to source
vaddps (%rcx), %ymm2, %ymm1 #; mix with destination
vmovaps %ymm1, (%rcx) #; copy result to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper
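#; For illustration only: in C with intrinsics this step is
#; _mm256_zeroupper(), which avoids the AVX-to-SSE transition penalty
#; on processors that track dirty upper halves of the ymm registers.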
#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

.MBWG_NONALIGN:

#; gain is already in %xmm0

.MBWG_NONALIGNLOOP:
movss (%rdx), %xmm1 #; src => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
addss (%rcx), %xmm1 #; xmm1 += dst
movss %xmm1, (%rcx) #; xmm1 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--
jnz .MBWG_NONALIGNLOOP
#; void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
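#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   void x86_sse_avx_mix_buffers_no_gain (float *dst, float *src,
#;                                         unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] += src[i]; /* mix src into dst at unity gain */
#;       }
#;   }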
.globl x86_sse_avx_mix_buffers_no_gain
.def x86_sse_avx_mix_buffers_no_gain; .scl 2; .type 32; .endef

x86_sse_avx_mix_buffers_no_gain:

#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .MBNG_NONALIGN #; if the buffers are not mutually aligned, process frame by frame

cmp $0, %rbx
je .MBNG_AVX #; already aligned at 32 bytes, proceed to AVX
#; Pre-loop: process 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

movss (%rdx), %xmm0 #; src => xmm0
addss (%rcx), %xmm0 #; xmm0 += dst
movss %xmm0, (%rcx) #; xmm0 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment

cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; if frames remain, but fewer than 8,
jl .MBNG_NONALIGN #; we cannot run AVX; finish frame by frame
vmovaps (%rdx), %ymm0 #; source => ymm0
vaddps (%rcx), %ymm0, %ymm1 #; mix with destination
vmovaps %ymm1, (%rcx) #; copy result to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

movss (%rdx), %xmm0 #; src => xmm0
addss (%rcx), %xmm0 #; xmm0 += dst
movss %xmm0, (%rcx) #; xmm0 => dst
#; void x86_sse_avx_copy_vector (float *dst, float *src, unsigned int nframes);
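#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes (equivalent to memcpy over float data):
#;
#;   void x86_sse_avx_copy_vector (float *dst, float *src,
#;                                 unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           dst[i] = src[i];
#;       }
#;   }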
.globl x86_sse_avx_copy_vector
.def x86_sse_avx_copy_vector; .scl 2; .type 32; .endef

x86_sse_avx_copy_vector:

#; due to the Microsoft calling convention:
#; %rcx float *dst
#; %rdx float *src
#; %r8  unsigned int nframes
#; save the registers
pushq %rbx #; must be preserved

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %rax #; dst address => %rax
andq $28, %rax #; mask the alignment offset within a 32 byte block
movq %rdx, %rbx #; src address => %rbx
andq $28, %rbx #; mask the alignment offset within a 32 byte block
cmp %rax, %rbx
jne .CB_NONALIGN #; if the buffers are not mutually aligned, copy frame by frame

cmp $0, %rbx
je .CB_AVX #; already aligned at 32 bytes, proceed to AVX
#; Pre-loop: copy 1-7 frames one at a time until the buffers
#; reach 32 byte alignment

movss (%rdx), %xmm0 #; src => xmm0
movss %xmm0, (%rcx) #; xmm0 => dst

addq $4, %rcx #; dst++
addq $4, %rdx #; src++

decq %r8 #; nframes--

addq $4, %rbx #; 4 bytes closer to 32 byte alignment

cmp $32, %rbx #; test whether we have reached 32 byte alignment

cmp $8, %r8 #; if frames remain, but fewer than 8,
jl .CB_NONALIGN #; we cannot run AVX; finish frame by frame
vmovaps (%rdx), %ymm0 #; source => ymm0
vmovaps %ymm0, (%rcx) #; copy to destination

addq $32, %rcx #; dst += 8 frames
addq $32, %rdx #; src += 8 frames

subq $8, %r8 #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if there are remaining frames, the non-aligned code below handles
#; the last 1-7 frames

movss (%rdx), %xmm0 #; src => xmm0
movss %xmm0, (%rcx) #; xmm0 => dst
#; void x86_sse_avx_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
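#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   void x86_sse_avx_apply_gain_to_buffer (float *buf,
#;                                          unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++) {
#;           buf[i] *= gain;
#;       }
#;   }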
.globl x86_sse_avx_apply_gain_to_buffer
.def x86_sse_avx_apply_gain_to_buffer; .scl 2; .type 32; .endef

x86_sse_avx_apply_gain_to_buffer:

#; due to the Microsoft calling convention:
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float gain
#; move gain to %xmm0 for convenience
movss %xmm2, %xmm0

#; if nframes == 0, go to end
#; check for alignment
movq %rcx, %r8 #; buf address => %r8
andq $28, %r8 #; mask the alignment offset (binary 11100)
jz .AG_AVX #; if the buffer IS aligned, jump straight to AVX
#; we iterate 1-7 times, multiplying one frame per pass with scalar
#; SSE instructions, until "buf" (%rcx) reaches a 32 byte aligned value

.AGLP_START:

#; load next value from the buffer into %xmm1
movss (%rcx), %xmm1 #; buf[0] => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
movss %xmm1, (%rcx) #; xmm1 => buf[0]

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jz .AG_END #; if we run out of frames, go to the end

addq $4, %r8 #; 4 bytes closer to 32 byte alignment
cmp $32, %r8
jne .AGLP_START #; if unaligned frames remain, do another pass
#; we have reached the 32 byte aligned "buf" (%rcx) value;
#; use AVX instructions

#; figure out how many loops we should do
movq %rdx, %rax #; copy remaining nframes to %rax for division
shr $3, %rax #; unsigned divide by 8

#; %rax = number of AVX iterations
#; set up the gain buffer (gain is already in %xmm0)
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits
vmovaps (%rcx), %ymm1 #; buf => ymm1
vmulps %ymm0, %ymm1, %ymm2 #; apply gain
vmovaps %ymm2, (%rcx) #; store the result back to buf

addq $32, %rcx #; buf += 8 frames
subq $8, %rdx #; nframes -= 8
#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; next we need to post-process all remaining frames;
#; the remaining frame count is in %rdx
.AGPOST_START:
movss (%rcx), %xmm1 #; buf[0] => xmm1
mulss %xmm0, %xmm1 #; xmm1 *= gain
movss %xmm1, (%rcx) #; xmm1 => buf[0]

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jnz .AGPOST_START #; while frames remain, do another pass

.AG_END:
ret
#; float x86_sse_avx_compute_peak(float *buf, long nframes, float current);
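#; For illustration only (not part of the build): a plain C sketch of
#; what this routine computes:
#;
#;   #include <math.h>
#;
#;   float x86_sse_avx_compute_peak (float *buf, long nframes, float current)
#;   {
#;       for (long i = 0; i < nframes; i++) {
#;           current = fmaxf (current, fabsf (buf[i]));
#;       }
#;       return current;
#;   }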
.globl x86_sse_avx_compute_peak
.def x86_sse_avx_compute_peak; .scl 2; .type 32; .endef

x86_sse_avx_compute_peak:

#; due to the Microsoft calling convention:
#; %rcx float *buf
#; %rdx unsigned int nframes
#; %xmm2 float current
#; move current max to %xmm0 for convenience
movss %xmm2, %xmm0

#; if nframes == 0, go to end

#; create the "abs" mask in %xmm3;
#; it will be used to discard the sign bit
#; check for alignment
movq %rcx, %r8 #; buf address => %r8
andq $28, %r8 #; mask the alignment offset (binary 11100)
jz .CP_AVX #; if the buffer IS aligned, jump straight to AVX
#; we iterate 1-7 times, doing scalar SSE float comparisons,
#; until "buf" (%rcx) reaches a 32 byte aligned value

.LP_START:

#; load next value from the buffer
movss (%rcx), %xmm1 #; buf[0] => xmm1
andps %xmm3, %xmm1 #; mask out sign bit
maxss %xmm1, %xmm0 #; current maximum => xmm0

#; increment buffer, decrement counter
addq $4, %rcx #; buf++
decq %rdx #; nframes--
jz .CP_END #; if we run out of frames, go to the end

addq $4, %r8 #; 4 bytes closer to 32 byte alignment
cmp $32, %r8
jne .LP_START #; if unaligned frames remain, do another pass
#; we have reached the 32 byte aligned "buf" (%rcx) value

#; figure out how many loops we should do
movq %rdx, %rax #; copy remaining nframes to %rax for division
shr $3, %rax #; unsigned divide by 8

#; %rax = number of AVX iterations
#; the current maximum is in %xmm0, but we need to broadcast it
#; to the whole ymm0 register
vshufps $0x00, %ymm0, %ymm0, %ymm0 #; replicate the float across the lower 128 bits of ymm0
vperm2f128 $0x00, %ymm0, %ymm0, %ymm0 #; copy the lower 128 bits of ymm0 to the upper 128 bits

#; broadcast the sign mask to the whole ymm3 register
vshufps $0x00, %ymm3, %ymm3, %ymm3 #; replicate the mask across the lower 128 bits of ymm3
vperm2f128 $0x00, %ymm3, %ymm3, %ymm3 #; copy the lower 128 bits of ymm3 to the upper 128 bits
vmovaps (%rcx), %ymm1 #; buf => ymm1
vandps %ymm3, %ymm1, %ymm1 #; mask out sign bits
vmaxps %ymm1, %ymm0, %ymm0 #; update the running maxima

addq $32, %rcx #; buf += 8 frames
subq $8, %rdx #; nframes -= 8
#; calculate the maximum value contained in the 8 floats of %ymm0
vshufps $0x4e, %ymm0, %ymm0, %ymm1 #; swap the left and right pairs (1234 => 3412) in each 128 bit half
vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maxima; at most 4 distinct values remain
vshufps $0xb1, %ymm0, %ymm0, %ymm1 #; swap the floats inside each pair (1234 => 2143) in each 128 bit half
vmaxps %ymm1, %ymm0, %ymm0 #; pairwise maxima; at most 2 distinct values remain
vperm2f128 $0x01, %ymm0, %ymm0, %ymm1 #; swap the 128 bit halves
vmaxps %ymm1, %ymm0, %ymm0 #; all 8 elements now hold the maximum
#; now every float in %ymm0 holds the same value: the current maximum
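#; For illustration only: the same 3-step horizontal max with AVX
#; intrinsics in C (the helper name hmax8 is ours, not from this file):
#;
#;   #include <immintrin.h>
#;   static float hmax8 (__m256 v)
#;   {
#;       __m256 t = _mm256_max_ps (v, _mm256_shuffle_ps (v, v, 0x4e));
#;       t = _mm256_max_ps (t, _mm256_shuffle_ps (t, t, 0xb1));
#;       t = _mm256_max_ps (t, _mm256_permute2f128_ps (t, t, 0x01));
#;       return _mm256_cvtss_f32 (t);
#;   }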
#; next we need to post-process all remaining frames;
#; the remaining frame count is in %rdx

#; zero the upper 128 bits of all ymm registers so we can continue
#; with SSE operations without transition penalties
vzeroupper

#; if no remaining frames, jump to the end
cmp $0, %rdx
jz .CP_END
movss (%rcx), %xmm1 #; buf[0] => xmm1
andps %xmm3, %xmm1 #; mask out sign bit
maxss %xmm1, %xmm0 #; update current maximum

addq $4, %rcx #; buf++
decq %rdx #; nframes--

.CP_END:
#; return value is in %xmm0
ret