2 Copyright (C) 2005-2006 Paul Davis, John Rigg
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 2 of the License, or
7 (at your option) any later version.
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
14 You should have received a copy of the GNU General Public License
15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
18 Author: Sampo Savolainen
19 64-bit conversion: John Rigg
25 #; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
#; Mixes src into dst with gain applied: the SSE loop below computes
#; dst[i] += src[i] * gain for four floats per iteration.
27 .globl x86_sse_mix_buffers_with_gain
28 .type x86_sse_mix_buffers_with_gain,@function
30 x86_sse_mix_buffers_with_gain:
34 #; %rdx unsigned int nframes
45 #; if nframes == 0, go to end
49 #; Check for alignment
52 andq $12, %rax #; mask alignment offset (result is 0, 4, 8 or 12)
55 andq $12, %rbx #; mask alignment offset (result is 0, 4, 8 or 12)
58 jne .MBWG_NONALIGN #; if not aligned, calculate manually
64 #; Pre-loop, we need to run 1-3 frames "manually" without
69 #; gain is already in %xmm0
75 addq $4, %rdi #; dst++ (one float = 4 bytes)
76 addq $4, %rsi #; src++ (one float = 4 bytes)
77 decq %rdx #; nframes--
82 cmp $16, %rbx #; test if we've reached 16 byte alignment
88 cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
89 jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
#; NOTE(review): jnge is a signed test while nframes is documented as
#; unsigned - confirm that a signed comparison is intended here.
91 #; gain is already in %xmm0
92 shufps $0x00, %xmm0, %xmm0 #; broadcast lane 0 (gain) to all four lanes
97 movaps (%rsi), %xmm1 #; source => xmm1 (aligned load of 4 floats)
98 mulps %xmm0, %xmm1 #; apply gain to source
99 addps (%rdi), %xmm1 #; mix with destination
100 movaps %xmm1, (%rdi) #; copy result to destination
102 addq $16, %rdi #; dst+=4 (4 floats = 16 bytes)
103 addq $16, %rsi #; src+=4 (4 floats = 16 bytes)
105 subq $4, %rdx #; nframes-=4
112 #; if there are remaining frames, the nonalign code will do nicely
113 #; for the remaining 1-3 frames.
118 #; gain is already in %xmm0
131 jnz .MBWG_NONALIGNLOOP #; loop back while the last flag-setting result is non-zero
143 .size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
146 #; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
#; Mixes src into dst without gain: dst[i] += src[i]. The SSE loop handles
#; four floats per iteration, the scalar code below it one float at a time.
148 .globl x86_sse_mix_buffers_no_gain
149 .type x86_sse_mix_buffers_no_gain,@function
151 x86_sse_mix_buffers_no_gain:
155 #; %rdx unsigned int nframes
160 #; save the registers
167 #; if nframes == 0, go to end
171 #; Check for alignment
174 andq $12, %rax #; mask alignment offset (result is 0, 4, 8 or 12)
177 andq $12, %rbx #; mask alignment offset (result is 0, 4, 8 or 12)
180 jne .MBNG_NONALIGN #; if not aligned, calculate manually
185 #; Pre-loop, we need to run 1-3 frames "manually" without
194 addq $4, %rdi #; dst++ (one float = 4 bytes)
195 addq $4, %rsi #; src++ (one float = 4 bytes)
196 decq %rdx #; nframes--
200 cmp $16, %rbx #; test if we've reached 16 byte alignment
205 cmp $4, %rdx #; if there are frames left, but less than 4
206 jnge .MBNG_NONALIGN #; we can't run SSE
#; NOTE(review): jnge is a signed test while nframes is documented as
#; unsigned - confirm that a signed comparison is intended here.
210 movaps (%rsi), %xmm0 #; source => xmm0 (aligned load of 4 floats)
211 addps (%rdi), %xmm0 #; mix with destination
212 movaps %xmm0, (%rdi) #; copy result to destination
214 addq $16, %rdi #; dst+=4 (4 floats = 16 bytes)
215 addq $16, %rsi #; src+=4 (4 floats = 16 bytes)
217 subq $4, %rdx #; nframes-=4
224 #; if there are remaining frames, the nonalign code will do nicely
225 #; for the remaining 1-3 frames.
230 movss (%rsi), %xmm0 #; src => xmm0 (single float)
231 addss (%rdi), %xmm0 #; xmm0 += dst
232 movss %xmm0, (%rdi) #; xmm0 => dst
250 .size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
253 #; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
#; Applies gain to the frames in buf (presumably in place - no separate
#; destination pointer is passed; verify against the store instructions).
255 .globl x86_sse_apply_gain_to_buffer
256 .type x86_sse_apply_gain_to_buffer,@function
258 x86_sse_apply_gain_to_buffer:
260 #; %rdi float *buf (the old "32(%rbp)" stack offset noted here looks stale for a register argument - TODO confirm)
261 #; %rsi unsigned int nframes
263 #; %xmm1 float buf[0]
273 #; if nframes == 0, go to end
#; NOTE(review): nframes is a 32-bit unsigned int but all 64 bits of %rsi
#; are copied below - confirm callers pass it zero-extended.
274 movq %rsi, %rcx #; nframes => %rcx (frame counter)
278 #; set up the gain buffer (gain is already in %xmm0)
279 shufps $0x00, %xmm0, %xmm0 #; broadcast lane 0 (gain) to all four lanes
281 #; Check for alignment
283 movq %rdi, %rdx #; buf => %rdx
284 andq $12, %rdx #; keep bits 2 & 3 (mask 12), result = 0, 4, 8 or 12
285 jz .AG_SSE #; if buffer IS aligned
288 #; we iterate 1-3 times, handling one frame per pass,
289 #; so we reach a 16 byte aligned "buf" (=%rdi) value
293 #; Load next value from the buffer into %xmm1
298 #; increment buffer, decrement counter
299 addq $4, %rdi #; buf++;
301 decq %rcx #; nframes--
302 jz .AG_END #; if we run out of frames, we go to the end
304 addq $4, %rdx #; one non-aligned frame (4 bytes) less
306 jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
310 #; We have reached the 16 byte aligned "buf" ("rdi") value
312 #; Figure out how many loops we should do
313 movq %rcx, %rax #; copy remaining nframes to %rax for division
314 movq $0, %rdx #; zero %rdx: divq divides the 128-bit value %rdx:%rax
#; NOTE(review): the divisor is %rdi, which also holds buf; presumably it
#; is temporarily loaded with 4 around this instruction - verify.
319 divq %rdi #; %rax = quotient, %rdx = remainder
322 #; %rax = SSE iterations
334 subq $4, %rcx #; nframes-=4
339 #; Next we need to post-process all remaining frames
340 #; the remaining frame count is in %rcx
342 #; if no remaining frames, jump to the end
344 andq $3, %rcx #; nframes % 4
353 #; increment buffer, decrement counter
354 addq $4, %rdi #; buf++;
356 decq %rcx #; nframes--
357 jnz .AGPOST_START #; loop while frames remain; fall through once none are left
368 .size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
372 #; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
#; Multiplies each frame of buf by the matching entry of gain_vector and
#; stores the product back into buf: buf[i] *= gain_vector[i].
374 .globl x86_sse_apply_gain_vector
375 .type x86_sse_apply_gain_vector,@function
377 x86_sse_apply_gain_vector:
380 #; %rsi float *gain_vector
381 #; %rdx unsigned int nframes
391 #; if nframes == 0 go to end
406 jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
408 #; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
411 movss (%rdi), %xmm0 #; buf => xmm0
412 movss (%rsi), %xmm1 #; gain value => xmm1
413 mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
414 movss %xmm0, (%rdi) #; signal with gain => buf
419 addq $4, %rdi #; buf++
420 addq $4, %rsi #; gain_vector++
426 #; There are frames left for sure, as that is checked in the beginning
427 #; and within the previous loop. BUT, there might be less than 4 frames
431 movq %rdx, %rax #; nframes => %rax
432 shr $2, %rax #; unsigned divide by 4
434 cmp $0, %rax #; "if it works without this, nice" (translated from Finnish); shr already sets ZF from its result
449 andq $3, %rdx #; Remaining frames are nframes & 3
453 #; Inside this loop, we know there are frames left to process
454 #; but because either there are < 4 frames left, or the buffers
455 #; are not aligned, we can't use the parallel SSE ops
457 movss (%rdi), %xmm0 #; buf => xmm0
458 movss (%rsi), %xmm1 #; gain value => xmm1
459 mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
460 movss %xmm0, (%rdi) #; signal with gain => buf
464 decq %rdx #; nframes--
476 .size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
480 #; float x86_sse_compute_peak(float *buf, long nframes, float current);
#; Keeps a running maximum in %xmm0, seeded with "current". An "abs" mask is
#; built in %xmm2, presumably so that absolute sample values are compared.
482 .globl x86_sse_compute_peak
483 .type x86_sse_compute_peak,@function
486 x86_sse_compute_peak:
488 #; %rdi float *buf (the old "32(%rbp)" stack offset noted here looks stale for a register argument - TODO confirm)
489 #; %rsi nframes (this note said "unsigned int", the prototype above says "long" - TODO confirm which is right)
490 #; %xmm0 float current
491 #; %xmm1 float buf[0]
499 #; if nframes == 0, go to end
500 movq %rsi, %rcx #; nframes => %rcx (frame counter)
504 #; create the "abs" mask in %xmm2
508 shufps $0x00, %xmm2, %xmm2 #; broadcast lane 0 of %xmm2 to all four lanes
510 #; Check for alignment
512 #;movq 8(%rbp), %rdi #; buf
513 movq %rdi, %rdx #; buf => %rdx
514 andq $12, %rdx #; keep bits 2 & 3 (mask 12), result = 0, 4, 8 or 12
515 jz .CP_SSE #; if buffer IS aligned
518 #; we iterate 1-3 times, comparing one float per pass,
519 #; so we reach a 16 byte aligned "buf" (=%rdi) value
523 #; Load next value from the buffer
528 #; increment buffer, decrement counter
529 addq $4, %rdi #; buf++;
531 decq %rcx #; nframes--
532 jz .CP_END #; if we run out of frames, we go to the end
534 addq $4, %rdx #; one non-aligned frame (4 bytes) less
536 jne .LP_START #; if more non-aligned frames exist, we do a do-over
540 #; We have reached the 16 byte aligned "buf" ("rdi") value
542 #; Figure out how many loops we should do
543 movq %rcx, %rax #; copy remaining nframes to %rax for division
545 shr $2,%rax #; unsigned divide by 4
548 #; %rax = SSE iterations
550 #; current maximum is at %xmm0, but we need to ..
551 shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
553 #;prefetcht0 16(%rdi)
566 #; Calculate the maximum value contained in the 4 FP's in %xmm0
568 shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
569 maxps %xmm1, %xmm0 #; maximums of the two pairs
571 shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
574 #; now every float in %xmm0 is the same value, current maximum value
576 #; Next we need to post-process all remaining frames
577 #; the remaining frame count is in %rcx
579 #; if no remaining frames, jump to the end
581 andq $3, %rcx #; nframes % 4
590 addq $4, %rdi #; buf++;
592 decq %rcx #; nframes--;
603 .size x86_sse_compute_peak, .-x86_sse_compute_peak
607 .section .note.GNU-stack,"",%progbits