Copyright (C) 2005-2006 Paul Davis, John Rigg

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Author: Sampo Savolainen
64-bit conversion: John Rigg
#; Microsoft version of SSE sample processing functions

#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
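#;
#; Reference only (not assembled): a plain C sketch of what this routine is
#; expected to compute, given the prototype above. The function name is
#; illustrative, not part of the build:
#;
#;   void mix_buffers_with_gain_ref (float *dst, const float *src,
#;                                   unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           dst[i] += src[i] * gain;   /* mix src into dst, scaled by gain */
#;   }
#;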
.globl x86_sse_mix_buffers_with_gain
	.def	x86_sse_mix_buffers_with_gain; .scl 2; .type 32; .endef

x86_sse_mix_buffers_with_gain:

#; due to Microsoft calling convention
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdx unsigned int nframes

	pushq %rbx #; must be preserved
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep algorithms universal - move input params into Linux-specific registers

	#; if nframes == 0, go to end

	#; Check for alignment
	andq $12, %rax #; mask alignment offset
	andq $12, %rbx #; mask alignment offset
	jne .MBWG_NONALIGN #; if not aligned, calculate manually
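	#; Note: $12 (binary 1100) isolates the address bits that determine the
	#; 16-byte (mis)alignment of a float pointer. The aligned SSE loop below
	#; uses movaps, which faults on addresses that are not 16-byte aligned,
	#; so that path is only entered once both dst and src sit on 16-byte
	#; boundaries; anything else falls through to the one-frame-at-a-time code.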
	#; Pre-loop: run 1-3 frames "manually", without SSE instructions,
	#; until the buffers reach 16 byte alignment

	#; gain is already in %xmm0

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx #; nframes--

	cmp $16, %rbx #; test if we've reached 16 byte alignment

	cmp $4, %rdx #; we know it's not zero, but if it's not >= 4, then
	jnge .MBWG_NONALIGN #; we jump straight to the "normal" code

	#; gain is already in %xmm0
	shufps $0x00, %xmm0, %xmm0
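	#; (shufps with an immediate of 0x00 copies element 0 of %xmm0 into all
	#; four lanes, i.e. %xmm0 becomes {gain, gain, gain, gain}, so a single
	#; mulps below applies the gain to four samples at once.)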
	movaps (%rsi), %xmm1 #; source => xmm1
	mulps %xmm0, %xmm1 #; apply gain to source
	addps (%rdi), %xmm1 #; mix with destination
	movaps %xmm1, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-3 frames.

	#; gain is already in %xmm0

	jnz .MBWG_NONALIGNLOOP

#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
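#;
#; Reference only (not assembled): the C equivalent this routine is expected
#; to match (illustrative function name):
#;
#;   void mix_buffers_no_gain_ref (float *dst, const float *src,
#;                                 unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           dst[i] += src[i];   /* plain mix, no gain applied */
#;   }
#;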
.globl x86_sse_mix_buffers_no_gain
	.def	x86_sse_mix_buffers_no_gain; .scl 2; .type 32; .endef

x86_sse_mix_buffers_no_gain:

#; due to Microsoft calling convention
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rdx unsigned int nframes

	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep algorithms universal - move input params into Linux-specific registers

	#; if nframes == 0, go to end

	#; Check for alignment
	andq $12, %rax #; mask alignment offset
	andq $12, %rbx #; mask alignment offset
	jne .MBNG_NONALIGN #; if not aligned, calculate manually

	#; Pre-loop: run 1-3 frames "manually", without SSE instructions,
	#; until the buffers reach 16 byte alignment

	addq $4, %rdi #; dst++
	addq $4, %rsi #; src++
	decq %rdx #; nframes--

	cmp $16, %rbx #; test if we've reached 16 byte alignment

	cmp $4, %rdx #; if there are frames left, but fewer than 4,
	jnge .MBNG_NONALIGN #; we can't run SSE

	movaps (%rsi), %xmm0 #; source => xmm0
	addps (%rdi), %xmm0 #; mix with destination
	movaps %xmm0, (%rdi) #; copy result to destination

	addq $16, %rdi #; dst+=4
	addq $16, %rsi #; src+=4

	subq $4, %rdx #; nframes-=4

	#; if there are remaining frames, the nonalign code will do nicely
	#; for the remaining 1-3 frames.

	movss (%rsi), %xmm0 #; src => xmm0
	addss (%rdi), %xmm0 #; xmm0 += dst
	movss %xmm0, (%rdi) #; xmm0 => dst

#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
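#;
#; Reference only (not assembled): the C equivalent this routine is expected
#; to match (illustrative function name):
#;
#;   void apply_gain_to_buffer_ref (float *buf, unsigned int nframes, float gain)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           buf[i] *= gain;   /* scale the buffer in place */
#;   }
#;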
.globl x86_sse_apply_gain_to_buffer
	.def	x86_sse_apply_gain_to_buffer; .scl 2; .type 32; .endef

x86_sse_apply_gain_to_buffer:

#; due to Microsoft calling convention
#; %rcx float *buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float *buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm1 float buf[0]

	#; save the registers
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep algorithms universal - move input params into Linux-specific registers

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes

	#; set up the gain buffer (gain is already in %xmm0)
	shufps $0x00, %xmm0, %xmm0

	#; Check for alignment

	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3; result = 0, 4, 8 or 12
	jz .AG_SSE #; if buffer IS aligned

	#; we iterate 1-3 times, applying the gain to single frames,
	#; until we reach a 16 byte aligned "buf" (=%rdi) value

	#; Load next value from the buffer into %xmm1

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .AG_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame (4 bytes) less
	jne .AGLP_START #; if more non-aligned frames exist, we do a do-over

	#; We have reached the 16 byte aligned "buf" (=%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division
	shr $2, %rax #; unsigned divide by 4

	#; %rax = SSE iterations

	addq $16, %rdi #; buf+=4
	subq $4, %rcx #; nframes-=4

	#; Next we need to post-process all remaining frames;
	#; the remaining frame count is in %rcx

	andq $3, %rcx #; nframes % 4

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jnz .AGPOST_START #; loop until no frames remain

#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
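#;
#; Reference only (not assembled): the C equivalent this routine is expected
#; to match (illustrative function name):
#;
#;   void apply_gain_vector_ref (float *buf, const float *gain_vector,
#;                               unsigned int nframes)
#;   {
#;       for (unsigned int i = 0; i < nframes; i++)
#;           buf[i] *= gain_vector[i];   /* per-sample gain */
#;   }
#;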
.globl x86_sse_apply_gain_vector
	.def	x86_sse_apply_gain_vector; .scl 2; .type 32; .endef

x86_sse_apply_gain_vector:

#; due to Microsoft calling convention
#; %rdx float *gain_vector
#; %r8 unsigned int nframes

#; due to System V AMD64 (Linux) calling convention
#; %rsi float *gain_vector
#; %rdx unsigned int nframes

	#; save the registers
	pushq %rbx #; must be preserved
	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep algorithms universal - move input params into Linux-specific registers

	#; if nframes == 0, go to end

	jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop

	#; Buffers aren't 16 byte aligned, but they are misaligned by the same amount

	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	addq $4, %rdi #; buf++
	addq $4, %rsi #; gain_vector++

	#; There are frames left for sure, as that is checked in the beginning
	#; and within the previous loop. BUT, there might be fewer than 4 frames
	#; left to process

	movq %rdx, %rax #; nframes => %rax
	shr $2, %rax #; unsigned divide by 4

	andq $3, %rdx #; remaining frames are nframes & 3

	#; Inside this loop, we know there are frames left to process,
	#; but because either there are < 4 frames left, or the buffers
	#; are not aligned, we can't use the parallel SSE ops

	movss (%rdi), %xmm0 #; buf => xmm0
	movss (%rsi), %xmm1 #; gain value => xmm1
	mulss %xmm1, %xmm0 #; xmm1 * xmm0 => xmm0
	movss %xmm0, (%rdi) #; signal with gain => buf

	decq %rdx #; nframes--

#; float x86_sse_compute_peak(float *buf, long nframes, float current);
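#;
#; Reference only (not assembled): the C equivalent this routine is expected
#; to match (illustrative function name; fabsf needs <math.h>):
#;
#;   float compute_peak_ref (const float *buf, long nframes, float current)
#;   {
#;       for (long i = 0; i < nframes; i++) {
#;           float a = fabsf (buf[i]);
#;           if (a > current)
#;               current = a;   /* track the largest absolute sample seen */
#;       }
#;       return current;
#;   }
#;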
.globl x86_sse_compute_peak
	.def	x86_sse_compute_peak; .scl 2; .type 32; .endef

x86_sse_compute_peak:

#; due to Microsoft calling convention
#; %rcx float* buf 32(%rbp)
#; %rdx unsigned int nframes
#; %xmm2 float current
#; %xmm1 float buf[0]

#; due to System V AMD64 (Linux) calling convention
#; %rdi float* buf 32(%rbp)
#; %rsi unsigned int nframes
#; %xmm0 float current
#; %xmm1 float buf[0]

	pushq %rdi #; must be preserved
	pushq %rsi #; must be preserved

	#; to keep algorithms universal - move input params into Linux-specific registers

	#; if nframes == 0, go to end
	movq %rsi, %rcx #; nframes

	#; create the "abs" mask in %xmm2
	shufps $0x00, %xmm2, %xmm2
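	#; (The mask is broadcast to all four lanes. Presumably it holds
	#; 0x7fffffff per float, so ANDing a sample with it clears the IEEE-754
	#; sign bit, i.e. takes the absolute value before the peak comparison.)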
	#; Check for alignment

	#;movq 8(%rbp), %rdi #; buf
	movq %rdi, %rdx #; buf => %rdx
	andq $12, %rdx #; mask bits 2 & 3; result = 0, 4, 8 or 12
	jz .CP_SSE #; if buffer IS aligned

	#; we iterate 1-3 times, doing scalar float comparisons,
	#; until we reach a 16 byte aligned "buf" (=%rdi) value

	#; Load next value from the buffer

	#; increment buffer, decrement counter
	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--
	jz .CP_END #; if we run out of frames, we go to the end

	addq $4, %rdx #; one non-aligned frame (4 bytes) less
	jne .LP_START #; if more non-aligned frames exist, we do a do-over

	#; We have reached the 16 byte aligned "buf" (=%rdi) value

	#; Figure out how many loops we should do
	movq %rcx, %rax #; copy remaining nframes to %rax for division
	shr $2, %rax #; unsigned divide by 4

	#; %rax = SSE iterations

	#; current maximum is in %xmm0, but we need it in all four lanes:
	shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's

	#;prefetcht0 16(%rdi)

	subq $4, %rcx #; nframes-=4

	#; Calculate the maximum value contained in the 4 FP's in %xmm0
	shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
	maxps %xmm1, %xmm0 #; maximums of the two pairs

	shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)

	#; now every float in %xmm0 holds the same value, the current maximum
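	#; (Worked example: with lanes {a,b,c,d}, the 0x4e shuffle + maxps leaves
	#; {max(a,c), max(b,d), max(c,a), max(d,b)}; the 0xb1 shuffle + maxps then
	#; leaves max(a,b,c,d) in every lane, so the scalar result can be read
	#; from any of them.)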
	#; Next we need to post-process all remaining frames;
	#; the remaining frame count is in %rcx

	#; if no remaining frames, jump to the end

	andq $3, %rcx #; nframes % 4

	addq $4, %rdi #; buf++;

	decq %rcx #; nframes--;

	#; return value is in xmm0
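	#; (Both the System V AMD64 and the Microsoft x64 calling conventions
	#; return a float in %xmm0, so the peak computed above can be returned
	#; as-is, without any extra register moves.)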