1 ////////////////////////////////////////////////////////////////////////////////
3 /// Win32 version of the AMD 3DNow! optimized routines for AMD K6-2/Athlon
4 /// processors. All 3DNow! optimized functions have been gathered into this
5 /// single source code file, regardless of their class or original source code
6 /// file, in order to ease porting the library to other compiler and processor
9 /// By the way; the performance gain depends heavily on the CPU generation: On
10 /// K6-2 these routines provided speed-up of even 2.4 times, while on Athlon the
11 /// difference to the original routines stayed at unremarkable 8%! Such a small
12 /// improvement on Athlon is due to the fact that 3DNow! can perform only two operations in
13 /// parallel, and obviously also the Athlon FPU is doing a very good job with
14 /// the standard C floating point routines! Here these routines are anyway,
15 /// although it might not be worth the effort to convert these to the GCC platform,
16 /// for Athlon CPU at least. The situation is different regarding the SSE
17 /// optimizations though, thanks to the four parallel operations of SSE that
18 /// already make a difference.
20 /// This file is to be compiled on the Windows platform with the Microsoft Visual C++
21 /// Compiler. Please see '3dnow_gcc.cpp' for the gcc compiler version for all
22 /// GNU platforms (if file supplied).
24 /// NOTICE: If using Visual Studio 6.0, you'll need to install the "Visual C++
25 /// 6.0 processor pack" update to support 3DNow! instruction set. The update is
26 /// available for download at Microsoft Developers Network, see here:
27 /// http://msdn.microsoft.com/vstudio/downloads/tools/ppack/default.aspx
29 /// If the above URL is expired or removed, go to "http://msdn.microsoft.com" and
30 /// perform a search with keywords "processor pack".
32 /// Author : Copyright (c) Olli Parviainen
33 /// Author e-mail : oparviai @ iki.fi
34 /// SoundTouch WWW: http://www.iki.fi/oparviai/soundtouch
36 ////////////////////////////////////////////////////////////////////////////////
38 // Last changed : $Date$
39 // File revision : $Revision$
43 ////////////////////////////////////////////////////////////////////////////////
47 // SoundTouch audio processing library
48 // Copyright (c) Olli Parviainen
50 // This library is free software; you can redistribute it and/or
51 // modify it under the terms of the GNU Lesser General Public
52 // License as published by the Free Software Foundation; either
53 // version 2.1 of the License, or (at your option) any later version.
55 // This library is distributed in the hope that it will be useful,
56 // but WITHOUT ANY WARRANTY; without even the implied warranty of
57 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
58 // Lesser General Public License for more details.
60 // You should have received a copy of the GNU Lesser General Public
61 // License along with this library; if not, write to the Free Software
62 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
64 ////////////////////////////////////////////////////////////////////////////////
66 #include "cpu_detect.h"
70 #error "wrong platform - this source code file is exclusively for Win32 platform"
73 using namespace soundtouch;
76 // 3DNow! routines available only with float sample type
78 //////////////////////////////////////////////////////////////////////////////
80 // implementation of 3DNow! optimized functions of class 'TDStretch3DNow'
82 //////////////////////////////////////////////////////////////////////////////
84 #include "TDStretch.h"
87 // these are declared in 'TDStretch.cpp'
88 extern int scanOffsets[4][24];
91 // Calculates cross correlation of two buffers
92 double TDStretch3DNow::calcCrossCorrStereo(const float *pV1, const float *pV2) const
94 uint overlapLengthLocal = overlapLength;
97 // Calculates the cross-correlation value between 'pV1' and 'pV2' vectors
102 for (i = 0; i < overlapLength / 4; i ++)
104 corr += pV1[0] * pV2[0];
120 // give prefetch hints to CPU of what data are to be needed soonish.
121 // give more aggressive hints on pV1 as that changes more between different calls
122 // while pV2 stays the same.
127 mov eax, dword ptr pV2
128 mov ebx, dword ptr pV1
132 mov ecx, overlapLengthLocal
133 shr ecx, 2 // div by four
137 prefetch [eax + 32] // give a prefetch hint to CPU what data are to be needed soonish
139 prefetch [ebx + 64] // give a prefetch hint to CPU what data are to be needed soonish
147 pfmul mm3, [ebx + 16]
151 pfmul mm4, [ebx + 24]
160 // add halfs of mm0 together and return the result.
161 // note: mm1 is used as a dummy parameter only, we actually don't care about it's value
173 //////////////////////////////////////////////////////////////////////////////
175 // implementation of 3DNow! optimized functions of class 'FIRFilter'
177 //////////////////////////////////////////////////////////////////////////////
179 #include "FIRFilter.h"
181 FIRFilter3DNow::FIRFilter3DNow() : FIRFilter()
183 filterCoeffsUnalign = NULL;
187 FIRFilter3DNow::~FIRFilter3DNow()
189 delete[] filterCoeffsUnalign;
193 // (overloaded) Calculates filter coefficients for 3DNow! routine
194 void FIRFilter3DNow::setCoefficients(const float *coeffs, uint newLength, uint uResultDivFactor)
199 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
201 // Scale the filter coefficients so that it won't be necessary to scale the filtering result
202 // also rearrange coefficients suitably for 3DNow!
203 // Ensure that filter coeffs array is aligned to 16-byte boundary
204 delete[] filterCoeffsUnalign;
205 filterCoeffsUnalign = new float[2 * newLength + 4];
206 filterCoeffsAlign = (float *)(((uint)filterCoeffsUnalign + 15) & -16);
208 fDivider = (float)resultDivider;
210 // rearrange the filter coefficients for mmx routines
211 for (i = 0; i < newLength; i ++)
213 filterCoeffsAlign[2 * i + 0] =
214 filterCoeffsAlign[2 * i + 1] = coeffs[i + 0] / fDivider;
219 // 3DNow!-optimized version of the filter routine for stereo sound
220 uint FIRFilter3DNow::evaluateFilterStereo(float *dest, const float *src, const uint numSamples) const
222 float *filterCoeffsLocal = filterCoeffsAlign;
223 uint count = (numSamples - length) & -2;
224 uint lengthLocal = length / 4;
227 assert(count % 2 == 0);
235 for (j = 0; j < count; j += 2)
242 filterCoeffsLocal = filterCoeffs;
243 for (i = 0; i < lengthLocal; i ++)
245 // unroll loop for efficiency.
247 suml1 += ptr[0] * filterCoeffsLocal[0] +
248 ptr[2] * filterCoeffsLocal[2] +
249 ptr[4] * filterCoeffsLocal[4] +
250 ptr[6] * filterCoeffsLocal[6];
252 sumr1 += ptr[1] * filterCoeffsLocal[1] +
253 ptr[3] * filterCoeffsLocal[3] +
254 ptr[5] * filterCoeffsLocal[5] +
255 ptr[7] * filterCoeffsLocal[7];
257 suml2 += ptr[8] * filterCoeffsLocal[0] +
258 ptr[10] * filterCoeffsLocal[2] +
259 ptr[12] * filterCoeffsLocal[4] +
260 ptr[14] * filterCoeffsLocal[6];
262 sumr2 += ptr[9] * filterCoeffsLocal[1] +
263 ptr[11] * filterCoeffsLocal[3] +
264 ptr[13] * filterCoeffsLocal[5] +
265 ptr[15] * filterCoeffsLocal[7];
268 filterCoeffsLocal += 8;
270 dest[0] = (float)suml1;
271 dest[1] = (float)sumr1;
272 dest[2] = (float)suml2;
273 dest[3] = (float)sumr2;
282 mov eax, dword ptr dest
283 mov ebx, dword ptr src
288 // "outer loop" : during each round 2*2 output samples are calculated
289 prefetch [ebx] // give a prefetch hint to CPU what data are to be needed soonish
290 prefetch [filterCoeffsLocal] // give a prefetch hint to CPU what data are to be needed soonish
293 mov edi, filterCoeffsLocal
299 // "inner loop" : during each round four FIR filter taps are evaluated for 2*2 output samples
302 prefetch [edi + 32] // give a prefetch hint to CPU what data are to be needed soonish
304 prefetch [esi + 32] // give a prefetch hint to CPU what data are to be needed soonish
312 pfmul mm5, [esi + 16]
317 pfmul mm2, [esi + 16]
319 pfmul mm6, [esi + 24]
324 pfmul mm3, [esi + 24]
326 pfmul mm7, [esi + 32]
350 #endif // ALLOW_3DNOW