From: Sampo Savolainen
Date: Thu, 2 Feb 2006 20:55:26 +0000 (+0000)
Subject: 64 bit SSE mod
X-Git-Tag: 2.0beta4~421
X-Git-Url: https://git.carlh.net/gitweb/?a=commitdiff_plain;h=483548cc130b82e7396c48753c662fe7430cd160;p=ardour.git

64 bit SSE mod

git-svn-id: svn://localhost/trunk/ardour2@308 d708f5d6-7413-0410-9779-e7cbd77b26cf
---

diff --git a/SConstruct b/SConstruct
index d77c92a29b..c1f6d9797b 100644
--- a/SConstruct
+++ b/SConstruct
@@ -23,20 +23,17 @@ subst_dict = { }
 opts = Options('scache.conf')
 opts.AddOptions(
-    BoolOption('ALTIVEC', 'Compile using Altivec instructions', 0),
     ('ARCH', 'Set architecture-specific compilation flags by hand (all flags as 1 argument)',''),
     BoolOption('SYSLIBS', 'USE AT YOUR OWN RISK: CANCELS ALL SUPPORT FROM ARDOUR AUTHORS: Use existing system versions of various libraries instead of internal ones', 0),
     BoolOption('DEBUG', 'Set to build with debugging information and no optimizations', 0),
     PathOption('DESTDIR', 'Set the intermediate install "prefix"', '/'),
     BoolOption('DEVBUILD', 'Use shared libardour (developers only)', 0),
     BoolOption('NLS', 'Set to turn on i18n support', 1),
-    BoolOption('NOARCH', 'Do not use architecture-specific compilation flags', 0),
     PathOption('PREFIX', 'Set the install "prefix"', '/usr/local'),
     BoolOption('VST', 'Compile with support for VST', 0),
     BoolOption('VERSIONED', 'Add version information to ardour/gtk executable name inside the build directory', 0),
-    BoolOption('USE_SSE_EVERYWHERE', 'Ask the compiler to use x86/SSE instructions and also our hand-written x86/SSE optimizations when possible (off by default)', 0),
-    BoolOption('BUILD_SSE_OPTIMIZATIONS', 'Use our hand-written x86/SSE optimizations when possible (off by default)', 0),
-    BoolOption('BUILD_VECLIB_OPTIMIZATIONS', 'Build with Apple Accelerate/vecLib optimizations when possible (off by default)', 0)
+    EnumOption('DIST_TARGET', 'Build target for cross compiling packagers', 'i386', allowed_values=('none', 'tiger', 'panther', 'i686', 'x86_64', 'i386'), ignorecase=2),
+    BoolOption('FPU_OPTIMIZATION', 'Build runtime checked assembler code', 1)
 )
 
 #----------------------------------------------------------------------
@@ -574,7 +571,7 @@ config_os = 3;
 config = config_guess.split ("-")
 
 if config[config_arch] == 'apple':
-    if env['BUILD_VECLIB_OPTIMIZATIONS'] == 1:
+    if env['FPU_OPTIMIZATION']:
         opt_flags.append ("-DBUILD_VECLIB_OPTIMIZATIONS")
         debug_flags.append ("-DBUILD_VECLIB_OPTIMIZATIONS")
         libraries['core'].Append(LINKFLAGS= '-framework Accelerate')
@@ -585,21 +582,20 @@ if config[config_cpu] == 'powerpc':
     #
     # -mcpu=7450 does not reliably work with gcc 3.*
     #
-    if env['NOARCH'] == 0:
-        if env['ALTIVEC'] == 1:
-            if config[config_arch] == 'apple':
-                opt_flags.extend ([ "-mcpu=7450", "-faltivec"])
-            else:
-                opt_flags.extend ([ "-mcpu=7400", "-maltivec", "-mabi=altivec"])
-        else:
-            opt_flags.extend([ "-mcpu=750", "-mmultiple" ])
-        opt_flags.extend (["-mhard-float", "-mpowerpc-gfxopt"])
+    if env['DIST_TARGET'] == 'panther' or env['DIST_TARGET'] == 'tiger':
+        if config[config_arch] == 'apple':
+            opt_flags.extend ([ "-mcpu=7450", "-faltivec"])
+        else:
+            opt_flags.extend ([ "-mcpu=7400", "-maltivec", "-mabi=altivec"])
+    else:
+        opt_flags.extend([ "-mcpu=750", "-mmultiple" ])
+    opt_flags.extend (["-mhard-float", "-mpowerpc-gfxopt"])
 
 elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_64", config[config_cpu]) != None)):
 
     build_host_supports_sse = 0
 
-    if env['NOARCH'] == 0:
+    if env['DIST_TARGET'] != 'none':
 
         debug_flags.append ("-DARCH_X86")
        opt_flags.append ("-DARCH_X86")
@@ -612,7 +608,7 @@ elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_6
         if "mmx" in x86_flags:
             opt_flags.append ("-mmmx")
         if "sse" in x86_flags:
-            build_host_supports_sse = 1
+            build_host_supports_sse = 1
         if "3dnow" in x86_flags:
             opt_flags.append ("-m3dnow")
@@ -621,17 +617,20 @@ elif ((re.search ("i[0-9]86", config[config_cpu]) != None) or (re.search ("x86_6
     elif config[config_cpu] == "i686":
         opt_flags.append ("-march=i686")
 
-    if env['USE_SSE_EVERYWHERE'] == 1:
-        opt_flags.extend (["-msse", "-mfpmath=sse"])
-        debug_flags.extend (["-msse", "-mfpmath=sse"])
-        if build_host_supports_sse != 1:
-            print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
-
-    if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
-        opt_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
-        debug_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
-        if build_host_supports_sse != 1:
-            print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
+    if (env['DIST_TARGET'] == 'i686') or (env['DIST_TARGET'] == 'x86_64'):
+        opt_flags.extend (["-msse", "-mfpmath=sse"])
+        debug_flags.extend (["-msse", "-mfpmath=sse"])
+        if build_host_supports_sse != 1:
+            print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
+
+    if env['FPU_OPTIMIZATION']:
+        opt_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
+        debug_flags.append ("-DBUILD_SSE_OPTIMIZATIONS")
+        if env['DIST_TARGET'] == 'x86_64':
+            opt_flags.append ("-DUSE_X86_64_ASM")
+            debug_flags.append ("-DUSE_X86_64_ASM")
+        if build_host_supports_sse != 1:
+            print "\nWarning: you are building Ardour with SSE support even though your system does not support these instructions. (This may not be an error, especially if you are a package maintainer)"
 
 # end of processor-specific section

diff --git a/libs/ardour/SConscript b/libs/ardour/SConscript
index 2b74a8d979..6f0b1fdc52 100644
--- a/libs/ardour/SConscript
+++ b/libs/ardour/SConscript
@@ -195,12 +195,23 @@ env['BUILDERS']['SharedAsmObject'] = Builder (action = '$CXX -c -fPIC $SOURCE -o
                                               single_source = 1)
 
 if env['DEVBUILD'] == 1:
-    if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
-        arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
+    if env['FPU_OPTIMIZATION']:
+        if env['DIST_TARGET'] == "i386":
+            arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
+        if env['DIST_TARGET'] == "i686":
+            arch_specific_objects = env.SharedAsmObject('sse_functions.os', 'sse_functions.s')
+        if env['DIST_TARGET'] == "x86_64":
+            arch_specific_objects = env.SharedAsmObject('sse_functions_64bit.os', 'sse_functions_64bit.s')
+
         libardour = ardour.SharedLibrary('ardour', ardour_files + extra_sources + arch_specific_objects)
 else:
-    if env['BUILD_SSE_OPTIMIZATIONS'] == 1:
-        arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
+    if env['FPU_OPTIMIZATION']:
+        if env['DIST_TARGET'] == "i386":
+            arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
+        if env['DIST_TARGET'] == "i686":
+            arch_specific_objects = env.StaticObject(target='sse_functions',source='sse_functions.s')
+        if env['DIST_TARGET'] == "x86_64":
+            arch_specific_objects = env.StaticObject(target='sse_functions_64bit',source='sse_functions_64bit.s')
         libardour = ardour.StaticLibrary('ardour', ardour_files + extra_sources + arch_specific_objects)

diff --git a/libs/ardour/globals.cc b/libs/ardour/globals.cc
index 238055089e..ea055a2b6b 100644
--- a/libs/ardour/globals.cc
+++ b/libs/ardour/globals.cc
@@ -192,6 +192,7 @@ ARDOUR::init (AudioEngine& engine, bool use_vst, bool try_optimization, void (*s
 
 		unsigned int use_sse = 0;
 
+#ifndef USE_X86_64_ASM
 		asm volatile (
 			"mov $1, %%eax\n"
 			"pushl %%ebx\n"
@@ -202,7 +203,21 @@ ARDOUR::init (AudioEngine& engine, bool use_vst, bool try_optimization, void (*s
 			: "=m" (use_sse)
 			:
 			: "%eax", "%ecx", "%edx", "memory");
+#else
+		asm volatile (
+			"movq $1, %%rax\n"
+			"pushq %%rbx\n"
+			"cpuid\n"
+			"popq %%rbx\n"
+			"andq $33554432, %%rdx\n"
+			"movq %%rdx, %0\n"
+			: "=m" (use_sse)
+			:
+			: "%rax", "%rcx", "%rdx", "memory");
+
+#endif /* USE_X86_64_ASM */
+
 		if (use_sse) {
 			cerr << "Enabling SSE optimized routines" << endl;

diff --git a/libs/ardour/sse_functions_64bit.s b/libs/ardour/sse_functions_64bit.s
new file mode 100644
index 0000000000..15ad0da841
--- /dev/null
+++ b/libs/ardour/sse_functions_64bit.s
@@ -0,0 +1,602 @@
+/*
+    Copyright (C) 2005-2006 Sampo Savolainen, John Rigg
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+    $Id$
+*/
+
+
+#; void x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
+
+.globl x86_sse_mix_buffers_with_gain
+        .type   x86_sse_mix_buffers_with_gain,@function
+
+x86_sse_mix_buffers_with_gain:
+
+#; %rdi  float *dst
+#; %rsi  float *src
+#; %rdx  unsigned int nframes
+#; %xmm0 float gain
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save the registers
+        pushq %rbx
+        pushq %rdi
+        pushq %rsi
+
+        #; if nframes == 0, go to end
+        cmp $0, %rdx
+        je .MBWG_END
+
+        #; Check for alignment
+
+        movq %rdi, %rax
+        andq $12, %rax #; mask alignment offset
+
+        movq %rsi, %rbx
+        andq $12, %rbx #; mask alignment offset
+
+        cmp %rax, %rbx
+        jne .MBWG_NONALIGN #; if not aligned, calculate manually
+
+        #; if we are aligned
+        cmp $0, %rbx
+        jz .MBWG_SSE
+
+        #; Pre-loop, we need to run 1-3 frames "manually" without
+        #; SSE instructions
+
+.MBWG_PRELOOP:
+
+        #; gain is already in %xmm0
+        movss (%rsi), %xmm1
+        mulss %xmm0, %xmm1
+        addss (%rdi), %xmm1
+        movss %xmm1, (%rdi)
+
+        addq $4, %rdi #; dst++
+        addq $4, %rsi #; src++
+        decq %rdx     #; nframes--
+        jz .MBWG_END
+
+        addq $4, %rbx
+
+        cmp $16, %rbx #; test if we've reached 16 byte alignment
+        jne .MBWG_PRELOOP
+
+
+.MBWG_SSE:
+
+        cmp $4, %rdx #; we know it's not zero, but if it's not >=4, then
+        jnge .MBWG_NONALIGN #; we jump straight to the "normal" code
+
+        #; gain is already in %xmm0
+        shufps $0x00, %xmm0, %xmm0
+
+
+.MBWG_SSELOOP:
+
+        movaps (%rsi), %xmm1 #; source => xmm0
+        mulps  %xmm0,  %xmm1 #; apply gain to source
+        addps  (%rdi), %xmm1 #; mix with destination
+        movaps %xmm1, (%rdi) #; copy result to destination
+
+        addq $16, %rdi #; dst+=4
+        addq $16, %rsi #; src+=4
+
+        subq $4, %rdx #; nframes-=4
+        cmp $4, %rdx
+        jge .MBWG_SSELOOP
+
+        cmp $0, %rdx
+        je .MBWG_END
+
+        #; if there are remaining frames, the nonalign code will do nicely
+        #; for the rest 1-3 frames.
+
+.MBWG_NONALIGN:
+        #; not aligned!
+
+        #; gain is already in %xmm0
+
+.MBWG_NONALIGNLOOP:
+
+        movss (%rsi), %xmm1
+        mulss %xmm0, %xmm1
+        addss (%rdi), %xmm1
+        movss %xmm1, (%rdi)
+
+        addq $4, %rdi
+        addq $4, %rsi
+
+        decq %rdx
+        jnz .MBWG_NONALIGNLOOP
+
+.MBWG_END:
+
+        popq %rsi
+        popq %rdi
+        popq %rbx
+
+        #; return
+        leave
+        ret
+
+.size x86_sse_mix_buffers_with_gain, .-x86_sse_mix_buffers_with_gain
+
+
+#; void x86_sse_mix_buffers_no_gain (float *dst, float *src, unsigned int nframes);
+
+.globl x86_sse_mix_buffers_no_gain
+        .type   x86_sse_mix_buffers_no_gain,@function
+
+x86_sse_mix_buffers_no_gain:
+
+#; %rdi  float *dst
+#; %rsi  float *src
+#; %rdx  unsigned int nframes
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save the registers
+        pushq %rbx
+        pushq %rdi
+        pushq %rsi
+
+        #; the real function
+
+        #; if nframes == 0, go to end
+        cmp $0, %rdx
+        je .MBNG_END
+
+        #; Check for alignment
+
+        movq %rdi, %rax
+        andq $12, %rax #; mask alignment offset
+
+        movq %rsi, %rbx
+        andq $12, %rbx #; mask alignment offset
+
+        cmp %rax, %rbx
+        jne .MBNG_NONALIGN #; if not aligned, calculate manually
+
+        cmp $0, %rbx
+        je .MBNG_SSE
+
+        #; Pre-loop, we need to run 1-3 frames "manually" without
+        #; SSE instructions
+
+.MBNG_PRELOOP:
+
+        movss (%rsi), %xmm0
+        addss (%rdi), %xmm0
+        movss %xmm0, (%rdi)
+
+        addq $4, %rdi #; dst++
+        addq $4, %rsi #; src++
+        decq %rdx     #; nframes--
+        jz .MBNG_END
+        addq $4, %rbx
+
+        cmp $16, %rbx #; test if we've reached 16 byte alignment
+        jne .MBNG_PRELOOP
+
+.MBNG_SSE:
+
+        cmp $4, %rdx #; if there are frames left, but less than 4
+        jnge .MBNG_NONALIGN #; we can't run SSE
+
+.MBNG_SSELOOP:
+
+        movaps (%rsi), %xmm0 #; source => xmm0
+        addps  (%rdi), %xmm0 #; mix with destination
+        movaps %xmm0, (%rdi) #; copy result to destination
+
+        addq $16, %rdi #; dst+=4
+        addq $16, %rsi #; src+=4
+
+        subq $4, %rdx #; nframes-=4
+        cmp $4, %rdx
+        jge .MBNG_SSELOOP
+
+        cmp $0, %rdx
+        je .MBNG_END
+
+        #; if there are remaining frames, the nonalign code will do nicely
+        #; for the rest 1-3 frames.
+
+.MBNG_NONALIGN:
+        #; not aligned!
+
+        movss (%rsi), %xmm0 #; src => xmm0
+        addss (%rdi), %xmm0 #; xmm0 += dst
+        movss %xmm0, (%rdi) #; xmm0 => dst
+
+        addq $4, %rdi
+        addq $4, %rsi
+
+        decq %rdx
+        jnz .MBNG_NONALIGN
+
+.MBNG_END:
+
+        popq %rsi
+        popq %rdi
+        popq %rbx
+
+        #; return
+        leave
+        ret
+
+.size x86_sse_mix_buffers_no_gain, .-x86_sse_mix_buffers_no_gain
+
+
+#; void x86_sse_apply_gain_to_buffer (float *buf, unsigned int nframes, float gain);
+
+.globl x86_sse_apply_gain_to_buffer
+        .type   x86_sse_apply_gain_to_buffer,@function
+
+x86_sse_apply_gain_to_buffer:
+
+#; %rdi  float *buf    32(%rbp)
+#; %rsi  unsigned int nframes
+#; %xmm0 float gain
+#; %xmm1 float buf[0]
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save %rdi
+        pushq %rdi
+
+        #; the real function
+
+        #; if nframes == 0, go to end
+        movq %rsi, %rcx #; nframes
+        cmp $0, %rcx
+        je .AG_END
+
+        #; set up the gain buffer (gain is already in %xmm0)
+        shufps $0x00, %xmm0, %xmm0
+
+        #; Check for alignment
+
+        movq %rdi, %rdx #; buf => %rdx
+        andq $12, %rdx  #; mask bits 1 & 2, result = 0, 4, 8 or 12
+        jz .AG_SSE      #; if buffer IS aligned
+
+        #; PRE-LOOP
+        #; we iterate 1-3 times, doing normal x87 float comparison
+        #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.AGLP_START:
+
+        #; Load next value from the buffer into %xmm1
+        movss (%rdi), %xmm1
+        mulss %xmm0, %xmm1
+        movss %xmm1, (%rdi)
+
+        #; increment buffer, decrement counter
+        addq $4, %rdi #; buf++;
+
+        decq %rcx   #; nframes--
+        jz .AG_END  #; if we run out of frames, we go to the end
+
+        addq $4, %rdx #; one non-aligned byte less
+        cmp $16, %rdx
+        jne .AGLP_START #; if more non-aligned frames exist, we do a do-over
+
+.AG_SSE:
+
+        #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+        #; Figure out how many loops we should do
+        movq %rcx, %rax #; copy remaining nframes to %rax for division
+        movq $0, %rdx   #; 0 the edx register
+
+
+        pushq %rdi
+        movq $4, %rdi
+        divq %rdi #; %rdx = remainder == 0
+        popq %rdi
+
+        #; %rax = SSE iterations
+        cmp $0, %rax
+        je .AGPOST_START
+
+
+.AGLP_SSE:
+
+        movaps (%rdi), %xmm1
+        mulps %xmm0, %xmm1
+        movaps %xmm1, (%rdi)
+
+        addq $16, %rdi
+        subq $4, %rcx #; nframes-=4
+
+        decq %rax
+        jnz .AGLP_SSE
+
+        #; Next we need to post-process all remaining frames
+        #; the remaining frame count is in %rcx
+
+        #; if no remaining frames, jump to the end
+        cmp $0, %rcx
+        andq $3, %rcx #; nframes % 4
+        je .AG_END
+
+.AGPOST_START:
+
+        movss (%rdi), %xmm1
+        mulss %xmm0, %xmm1
+        movss %xmm1, (%rdi)
+
+        #; increment buffer, decrement counter
+        addq $4, %rdi #; buf++;
+
+        decq %rcx        #; nframes--
+        jnz .AGPOST_START #; if we run out of frames, we go to the end
+
+.AG_END:
+
+
+        popq %rdi
+
+        #; return
+        leave
+        ret
+
+.size x86_sse_apply_gain_to_buffer, .-x86_sse_apply_gain_to_buffer
+#; end proc
+
+
+#; x86_sse_apply_gain_vector(float *buf, float *gain_vector, unsigned int nframes)
+
+.globl x86_sse_apply_gain_vector
+        .type   x86_sse_apply_gain_vector,@function
+
+x86_sse_apply_gain_vector:
+
+#; %rdi  float *buf
+#; %rsi  float *gain_vector
+#; %rdx  unsigned int nframes
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; Save registers
+        pushq %rdi
+        pushq %rsi
+        pushq %rbx
+
+        #; if nframes == 0 go to end
+        cmp $0, %rdx
+        je .AGA_END
+
+        #; Check alignment
+        movq %rdi, %rax
+        andq $12, %rax
+
+        movq %rsi, %rbx
+        andq $12, %rbx
+
+        cmp %rax,%rbx
+        jne .AGA_ENDLOOP
+
+        cmp $0, %rax
+        jz .AGA_SSE #; if buffers are aligned, jump to the SSE loop
+
+#; Buffers aren't 16 byte aligned, but they are unaligned by the same amount
+.AGA_ALIGNLOOP:
+
+        movss (%rdi), %xmm0 #; buf => xmm0
+        movss (%rsi), %xmm1 #; gain value => xmm1
+        mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+        movss %xmm0, (%rdi) #; signal with gain => buf
+
+        decq %rdx
+        jz .AGA_END
+
+        addq $4, %rdi #; buf++
+        addq $4, %rsi #; gain_vector++
+
+        addq $4, %rax
+        cmp $16, %rax
+        jne .AGA_ALIGNLOOP
+
+#; There are frames left for sure, as that is checked in the beginning
+#; and within the previous loop. BUT, there might be less than 4 frames
+#; to process
+
+.AGA_SSE:
+        movq %rdx, %rax #; nframes => %rax
+        shr $2, %rax    #; unsigned divide by 4
+
+        cmp $0, %rax #; if it works without this check, nice
+        je .AGA_ENDLOOP
+
+.AGA_SSELOOP:
+        movaps (%rdi), %xmm0
+        movaps (%rsi), %xmm1
+        mulps %xmm1, %xmm0
+        movaps %xmm0, (%rdi)
+
+        addq $16, %rdi
+        addq $16, %rsi
+
+        decq %rax
+        jnz .AGA_SSELOOP
+
+        andq $3, %rdx #; Remaining frames are nframes & 3
+        jz .AGA_END
+
+
+#; Inside this loop, we know there are frames left to process
+#; but because either there are < 4 frames left, or the buffers
+#; are not aligned, we can't use the parallel SSE ops
+.AGA_ENDLOOP:
+        movss (%rdi), %xmm0 #; buf => xmm0
+        movss (%rsi), %xmm1 #; gain value => xmm1
+        mulss %xmm1, %xmm0  #; xmm1 * xmm0 => xmm0
+        movss %xmm0, (%rdi) #; signal with gain => buf
+
+        addq $4,%rdi
+        addq $4,%rsi
+        decq %rdx #; nframes--
+        jnz .AGA_ENDLOOP
+
+.AGA_END:
+
+        popq %rbx
+        popq %rsi
+        popq %rdi
+
+        leave
+        ret
+
+.size x86_sse_apply_gain_vector, .-x86_sse_apply_gain_vector
+#; end proc
+
+
+#; float x86_sse_compute_peak(float *buf, long nframes, float current);
+
+.globl x86_sse_compute_peak
+        .type   x86_sse_compute_peak,@function
+
+abs_mask:
+        .long   2147483647
+
+
+x86_sse_compute_peak:
+
+#; %rdi  float *buf    32(%rbp)
+#; %rsi  unsigned int nframes
+#; %xmm0 float current
+#; %xmm1 float buf[0]
+
+        pushq %rbp
+        movq %rsp, %rbp
+
+        #; save %rdi
+        pushq %rdi
+
+        #; if nframes == 0, go to end
+        movq %rsi, %rcx #; nframes
+        cmp $0, %rcx
+        je .CP_END
+
+        #; create the "abs" mask in %xmm2
+        movss abs_mask, %xmm2
+        shufps $0x00, %xmm2, %xmm2
+
+        #; Check for alignment
+
+        #;movq 8(%rbp), %rdi #; buf
+        movq %rdi, %rdx #; buf => %rdx
+        andq $12, %rdx  #; mask bits 1 & 2, result = 0, 4, 8 or 12
+        jz .CP_SSE      #; if buffer IS aligned
+
+        #; PRE-LOOP
+        #; we iterate 1-3 times, doing normal x87 float comparison
+        #; so we reach a 16 byte aligned "buf" (=%rdi) value
+
+.LP_START:
+
+        #; Load next value from the buffer
+        movss (%rdi), %xmm1
+        andps %xmm2, %xmm1
+        maxss %xmm1, %xmm0
+
+        #; increment buffer, decrement counter
+        addq $4, %rdi #; buf++;
+
+        decq %rcx   #; nframes--
+        jz .CP_END  #; if we run out of frames, we go to the end
+
+        addq $4, %rdx #; one non-aligned byte less
+        cmp $16, %rdx
+        jne .LP_START #; if more non-aligned frames exist, we do a do-over
+
+.CP_SSE:
+
+        #; We have reached the 16 byte aligned "buf" ("rdi") value
+
+        #; Figure out how many loops we should do
+        movq %rcx, %rax #; copy remaining nframes to %rax for division
+
+        shr $2,%rax #; unsigned divide by 4
+        jz .POST_START
+
+        #; %rax = SSE iterations
+
+        #; current maximum is at %xmm0, but we need to ..
+        shufps $0x00, %xmm0, %xmm0 #; shuffle "current" to all 4 FP's
+
+        #;prefetcht0 16(%rdi)
+
+.LP_SSE:
+
+        movaps (%rdi), %xmm1
+        andps %xmm2, %xmm1
+        maxps %xmm1, %xmm0
+
+        addq $16, %rdi
+
+        decq %rax
+        jnz .LP_SSE
+
+        #; Calculate the maximum value contained in the 4 FP's in %xmm0
+        movaps %xmm0, %xmm1
+        shufps $0x4e, %xmm1, %xmm1 #; shuffle left & right pairs (1234 => 3412)
+        maxps %xmm1, %xmm0         #; maximums of the two pairs
+        movaps %xmm0, %xmm1
+        shufps $0xb1, %xmm1, %xmm1 #; shuffle the floats inside the two pairs (1234 => 2143)
+        maxps %xmm1, %xmm0
+
+        #; now every float in %xmm0 is the same value, current maximum value
+
+        #; Next we need to post-process all remaining frames
+        #; the remaining frame count is in %rcx
+
+        #; if no remaining frames, jump to the end
+
+        andq $3, %rcx #; nframes % 4
+        jz .CP_END
+
+.POST_START:
+
+        movss (%rdi), %xmm1
+        andps %xmm2, %xmm1
+        maxss %xmm1, %xmm0
+
+        addq $4, %rdi #; buf++;
+
+        decq %rcx #; nframes--;
+        jnz .POST_START
+
+.CP_END:
+
+        popq %rdi
+
+        #; return
+        leave
+        ret
+
+.size x86_sse_compute_peak, .-x86_sse_compute_peak
+#; end proc
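
For reference, the five entry points added above follow the System V AMD64 calling convention: pointer arguments arrive in %rdi and %rsi, the frame count in %rdx (or %rsi for the single-buffer routines), and the float gain or current peak in %xmm0. The usage sketch below is not part of this commit: the declarations are copied from the "#;" signature comments in sse_functions_64bit.s, while the file name (peak_test.cc) and the test values are illustrative assumptions. It presumes the assembly has been assembled and linked in, e.g. g++ peak_test.cc sse_functions_64bit.s.

// peak_test.cc (hypothetical): exercises two of the routines defined in
// sse_functions_64bit.s; prototypes taken from the assembly's "#;" comments.
#include <cstdio>

extern "C" {
	void  x86_sse_mix_buffers_with_gain (float *dst, float *src, unsigned int nframes, float gain);
	void  x86_sse_mix_buffers_no_gain   (float *dst, float *src, unsigned int nframes);
	void  x86_sse_apply_gain_to_buffer  (float *buf, unsigned int nframes, float gain);
	void  x86_sse_apply_gain_vector     (float *buf, float *gain_vector, unsigned int nframes);
	float x86_sse_compute_peak          (float *buf, long nframes, float current);
}

int main ()
{
	float dst[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
	float src[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };

	x86_sse_mix_buffers_with_gain (dst, src, 8, 0.5f); // dst[i] += src[i] * 0.5
	float peak = x86_sse_compute_peak (dst, 8, 0.0f);  // running peak, seeded with 0.0f

	printf ("peak = %f\n", peak);                      // expect 4.0
	return 0;
}

Unaligned or short buffers are handled by the routines themselves (the pre- and post-loops above), so the caller does not need to guarantee 16-byte alignment.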