00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00045
00046 #include "STTypes.h"
00047 using namespace soundtouch;
00048
00049 #ifdef ALLOW_MMX
00050 #include <stdexcept>
00051 #include <string>
00052 #include <climits>
00053
00054 // Every MMX statement below is written twice, once per back-end, and SI() picks one:
00055 //   SI(A, B) expands to A (compiler intrinsics) when USE_GCC_INTRINSICS is defined, otherwise to B (the asm-style macro form; movq_a2r etc. are presumably provided by x86/mmx.h).
00056 //   GI(X) expands to X only in the intrinsics build; the functions use it to declare their __m64 temporaries.
00057 #ifdef USE_GCC_INTRINSICS
00058 # include <mmintrin.h>
00059 # define SI(A,B...) A
00060 # define GI(X...) X
00061 #else
00062 # include "x86/mmx.h"
00063 # define _mm_empty() __asm__ __volatile__ ("emms")
00064 # define __m64 mmx_t
00065 # define SI(A,B...) B
00066 # define GI(X...)
00067 #endif
00068
00069 #include "cpu_detect.h"
00070 #include "TDStretch.h"
00071
00072
00073
00075
00076
00077
00079 // Declaration only: the table is defined outside this listing and is not referenced
00080 // by any function visible in it.
00081 extern int scanOffsets[4][24];
00082
00083
00084 long TDStretchMMX::calcCrossCorrStereo(const short *pV1, const short *pV2) const
00085 {
00086 uint tmp;
00087 uint counter = (overlapLength>>3)-1;
00088 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
00089 GI(__m64 m0, m1, m2, m3, m4, m5);
00090 uint shift = overlapDividerBits;
00091
00092
00093 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
00094 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00095 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00096 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
00097
00098 do {
00099
00100
00101
00102 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
00103 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00104 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
00105 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00106 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
00107 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00108 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
00109 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
00110 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00111 pv1 += 4;
00112 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00113 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00114 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00115 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00116 pv2 += 4;
00117 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00118 } while ((--counter)!=0);
00119
00120
00121 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
00122 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00123 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
00124 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00125 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
00126 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00127 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
00128 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00129 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00130 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00131 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00132 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00133
00134
00135
00136 SI(m1 = m0, movq_r2r(mm0, mm1));
00137 SI(m1 = _mm_srli_si64(m1, 32), psrld_i2r(32, mm1));
00138 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00139 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
00140 return tmp;
00141 }
00142
00143 #ifdef USE_MULTI_MMX
00144
00145 long TDStretchMMX::calcCrossCorrMulti(const short *pV1, const short *pV2) const
00146 {
00147
00148 static const __m64 mm_mask[4][8] __attribute__ ((aligned(8))) = {
00149 {
00150
00151 0xffffffffffffffffULL,
00152 0xffffffffffffffffULL,
00153 0xffffffffffffffffULL,
00154 0xffffffffffffffffULL,
00155 0,
00156 0,
00157 0,
00158 0
00159 },
00160 {
00161 0xffffffffffffffffULL,
00162 0xffffffffffffffffULL,
00163 0xffffffffffffffffULL,
00164 0x0000ffffffffffffULL,
00165 0,
00166 0,
00167 0,
00168 0
00169 },
00170 {
00171 0xffffffffffffffffULL,
00172 0xffffffffffffffffULL,
00173 0xffffffffffffffffULL,
00174 0x00000000ffffffffULL,
00175 0,
00176 0,
00177 0,
00178 0
00179 },
00180 {
00181 0xffffffffffffffffULL,
00182 0xffffffffffffffffULL,
00183 0xffffffffffffffffULL,
00184 0x000000000000ffffULL,
00185 0,
00186 0,
00187 0,
00188 0
00189 }
00190 };
00191 uint tmp;
00192 uint adjustedOverlapLength = overlapLength*channels;
00193 uint counter = ((adjustedOverlapLength+15)>>4)-1;
00194 uint remainder = (16-adjustedOverlapLength)&0xf;
00195
00196 __m64 *ph = (__m64*)&mm_mask[remainder&3][remainder>>2];
00197 __m64 *pv1=(__m64*)pV1, *pv2=(__m64*)pV2;
00198 GI(__m64 m0, m1, m2, m3, m4, m5, m6);
00199 uint shift = overlapDividerBits;
00200
00201
00202 SI(m1 = pv1[0], movq_a2r(0, pv1, mm1));
00203 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00204 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00205 SI(m5 = _mm_cvtsi32_si64(shift),movd_v2r(shift, mm5));
00206
00207 do {
00208
00209
00210
00211 SI(m1 = _mm_madd_pi16(m1, pv2[0]),pmaddwd_a2r(0, pv2, mm1));
00212 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00213 SI(m2 = _mm_madd_pi16(m2, pv2[1]),pmaddwd_a2r(8, pv2, mm2));
00214 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00215 SI(m3 = _mm_madd_pi16(m3, pv2[2]),pmaddwd_a2r(16, pv2, mm3));
00216 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00217 SI(m4 = _mm_madd_pi16(m4, pv2[3]),pmaddwd_a2r(24, pv2, mm4));
00218 SI(m1 = pv1[4], movq_a2r(32, pv1, mm1));
00219 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00220 pv1 += 4;
00221 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00222 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00223 SI(m2 = pv1[1], movq_a2r(8, pv1, mm2));
00224 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00225 pv2 += 4;
00226 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00227 } while ((--counter)!=0);
00228
00229 SI(m6 = ph[0], movq_a2r(0, ph, mm6));
00230
00231 SI(m1 = _mm_madd_pi16(m1, pv2[0]), pmaddwd_a2r(0, pv2, mm1));
00232 SI(m1 = _mm_and_si64(m1, m6), pand_r2r(mm6, mm1));
00233 SI(m3 = pv1[2], movq_a2r(16, pv1, mm3));
00234 SI(m6 = ph[1], movq_a2r(8, ph, mm6));
00235 SI(m2 = _mm_madd_pi16(m2, pv2[1]), pmaddwd_a2r(8, pv2, mm2));
00236 SI(m2 = _mm_and_si64(m2, m6), pand_r2r(mm6, mm2));
00237 SI(m4 = pv1[3], movq_a2r(24, pv1, mm4));
00238 SI(m6 = ph[2], movq_a2r(16, ph, mm6));
00239 SI(m3 = _mm_madd_pi16(m3, pv2[2]), pmaddwd_a2r(16, pv2, mm3));
00240 SI(m3 = _mm_and_si64(m3, m6), pand_r2r(mm6, mm3));
00241 SI(m2 = _mm_add_pi32(m2, m1), paddd_r2r(mm1, mm2));
00242 SI(m6 = ph[3], movq_a2r(24, ph, mm6));
00243 SI(m4 = _mm_madd_pi16(m4, pv2[3]), pmaddwd_a2r(24, pv2, mm4));
00244 SI(m4 = _mm_and_si64(m4, m6), pand_r2r(mm6, mm4));
00245 SI(m2 = _mm_srai_pi32(m2, m5), psrad_r2r(mm5, mm2));
00246 SI(m3 = _mm_add_pi32(m3, m4), paddd_r2r(mm4, mm3));
00247 SI(m0 = _mm_add_pi32(m0, m2), paddd_r2r(mm2, mm0));
00248 SI(m3 = _mm_srai_pi32(m3, m5), psrad_r2r(mm5, mm3));
00249 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00250
00251
00252
00253 SI(m1 = m0, movq_r2r(mm0, mm1));
00254 SI(m1 = _mm_srli_si64(m1, 32), psrld_i2r(32, mm1));
00255 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00256 SI(tmp = _mm_cvtsi64_si32(m0), movd_r2m(mm0, tmp));
00257 return tmp;
00258 }
00259 #endif
00260
00261 void TDStretchMMX::clearCrossCorrState()
00262 {
00263 _mm_empty();
00264 }
00265
00266
00267 void TDStretchMMX::overlapStereo(short *output, const short *input) const
00268 {
00269 _mm_empty();
00270 uint shift = overlapDividerBits;
00271 uint counter = overlapLength>>2;
00272 __m64 *inPtr = (__m64*) input;
00273 __m64 *midPtr = (__m64*) pMidBuffer;
00274 __m64 *outPtr = ((__m64*) output)-2;
00275 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00276
00277
00278 uint tmp0 = 0x0002fffe;
00279 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
00280 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
00281
00282 SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
00283 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
00284
00285 uint tmp1 = (overlapLength-1)|0x00010000;
00286 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
00287 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
00288
00289 do {
00290
00291
00292
00293
00294
00295
00296
00297
00298
00299
00300
00301
00302
00303
00304
00305
00306 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
00307 outPtr += 2;
00308 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
00309 SI(m1 = m0, movq_r2r(mm0, mm1));
00310 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
00311 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
00312 midPtr += 2;
00313 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
00314 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
00315 inPtr+=2;
00316 SI(m3 = m2, movq_r2r(mm2, mm3));
00317 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
00318
00319 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
00320 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
00321 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
00322
00323 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
00324 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00325 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00326 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00327
00328 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
00329 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
00330
00331 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
00332 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
00333 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
00334 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
00335 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00336 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
00337 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00338 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
00339 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
00340 } while ((--counter)!=0);
00341 _mm_empty();
00342 }
00343
00344 #if 0
00345
00346 void TDStretchMMX::overlapMulti(short *output, const short *input) const
00347 {
00348 _mm_empty();
00349 uint shift = overlapDividerBits;
00350 uint counter = overlapLength>>2;
00351 __m64 *inPtr = (__m64*) input;
00352 __m64 *midPtr = (__m64*) pMidBuffer;
00353 __m64 *outPtr = ((__m64*) output)-2;
00354 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00355
00356
00357 uint tmp0 = 0x0002fffe;
00358 SI(m5 = _mm_cvtsi32_si64(tmp0), movd_v2r(tmp0, mm5));
00359 SI(m5 = _mm_unpacklo_pi32(m5,m5), punpckldq_r2r(mm5, mm5));
00360
00361 SI(m6 = _mm_cvtsi32_si64(overlapLength), movd_v2r(overlapLength, mm6));
00362 SI(m6 = _mm_unpacklo_pi32(m6, m6), punpckldq_r2r(mm6, mm6));
00363
00364 uint tmp1 = (overlapLength-1)|0x00010000;
00365 SI(m7 = _mm_cvtsi32_si64(tmp1), movd_v2r(tmp1, mm7));
00366 SI(m7 = _mm_unpacklo_pi32(m7, m7), punpckldq_r2r(mm7, mm7));
00367
00368 do {
00369
00370
00371
00372
00373
00374
00375
00376
00377
00378
00379
00380
00381
00382
00383
00384
00385 SI(m0 = midPtr[0], movq_a2r(0, midPtr, mm0));
00386 outPtr += 2;
00387 SI(m3 = inPtr[0], movq_a2r(0, inPtr, mm3));
00388 SI(m1 = m0, movq_r2r(mm0, mm1));
00389 SI(m2 = midPtr[1], movq_a2r(8, midPtr, mm2));
00390 SI(m0 = _mm_unpacklo_pi16(m0, m3),punpcklwd_r2r(mm3, mm0));
00391 midPtr += 2;
00392 SI(m4 = inPtr[1], movq_a2r(8, inPtr, mm4));
00393 SI(m1 = _mm_unpackhi_pi16(m1, m3),punpckhwd_r2r(mm3, mm1));
00394 inPtr+=2;
00395 SI(m3 = m2, movq_r2r(mm2, mm3));
00396 SI(m2 = _mm_unpacklo_pi16(m2, m4),punpcklwd_r2r(mm4, mm2));
00397
00398 SI(m0 = _mm_madd_pi16(m0, m6), pmaddwd_r2r(mm6, mm0));
00399 SI(m3 = _mm_unpackhi_pi16(m3, m4),punpckhwd_r2r(mm4, mm3));
00400 SI(m4 = _mm_cvtsi32_si64(shift), movd_v2r(shift, mm4));
00401
00402 SI(m1 = _mm_madd_pi16(m1, m7), pmaddwd_r2r(mm7, mm1));
00403 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00404 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00405 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00406
00407 SI(m2 = _mm_madd_pi16(m2, m6), pmaddwd_r2r(mm6, mm2));
00408 SI(m1 = _mm_srai_pi32(m1, m4), psrad_r2r(mm4, mm1));
00409
00410 SI(m3 = _mm_madd_pi16(m3, m7), pmaddwd_r2r(mm7, mm3));
00411 SI(m2 = _mm_srai_pi32(m2, m4), psrad_r2r(mm4, mm2));
00412 SI(m0 = _mm_packs_pi32(m0, m1), packssdw_r2r(mm1, mm0));
00413 SI(m3 = _mm_srai_pi32(m3, m4), psrad_r2r(mm4, mm3));
00414 SI(m6 = _mm_add_pi16(m6, m5), paddw_r2r(mm5, mm6));
00415 SI(m2 = _mm_packs_pi32(m2, m3), packssdw_r2r(mm3, mm2));
00416 SI(m7 = _mm_add_pi16(m7, m5), paddw_r2r(mm5, mm7));
00417 SI(outPtr[0] = m0, movq_r2a(mm0, 0, outPtr));
00418 SI(outPtr[1] = m2, movq_r2a(mm2, 8, outPtr));
00419 } while ((--counter)!=0);
00420 _mm_empty();
00421 }
00422 #endif
00423
00425
00426
00427
00429
00430 #include "FIRFilter.h"
00431 FIRFilterMMX::FIRFilterMMX() : FIRFilter()
00432 {
00433 filterCoeffsUnalign = NULL;
00434 }
00435
00436
00437 FIRFilterMMX::~FIRFilterMMX()
00438 {
00439 delete[] filterCoeffsUnalign;
00440 }
00441
00442
00443 void FIRFilterMMX::setCoefficients(const short *coeffs, uint newLength, uint uResultDivFactor)
00444 {
00445 uint i;
00446 FIRFilter::setCoefficients(coeffs, newLength, uResultDivFactor);
00447
00448
00449 delete[] filterCoeffsUnalign;
00450 filterCoeffsUnalign = new short[2 * newLength + 8];
00451 filterCoeffsAlign = (short *)(((ulong)filterCoeffsUnalign + 15) & -16);
00452
00453
00454 for (i = 0;i < length; i += 4)
00455 {
00456 filterCoeffsAlign[2 * i + 0] = coeffs[i + 0];
00457 filterCoeffsAlign[2 * i + 1] = coeffs[i + 2];
00458 filterCoeffsAlign[2 * i + 2] = coeffs[i + 0];
00459 filterCoeffsAlign[2 * i + 3] = coeffs[i + 2];
00460
00461 filterCoeffsAlign[2 * i + 4] = coeffs[i + 1];
00462 filterCoeffsAlign[2 * i + 5] = coeffs[i + 3];
00463 filterCoeffsAlign[2 * i + 6] = coeffs[i + 1];
00464 filterCoeffsAlign[2 * i + 7] = coeffs[i + 3];
00465 }
00466 }
00467
00468
00469
00470
00471 uint FIRFilterMMX::evaluateFilterStereo(short *dest, const short *src, const uint numSamples) const
00472 {
00473 _mm_empty();
00474 __m64 *inPtr = (__m64*)src;
00475 __m64 *outPtr = ((__m64*)dest) - 1;
00476 uint counter = (numSamples - length) >> 1;
00477 GI(__m64 m0, m1, m2, m3, m4, m5, m6, m7);
00478
00479 do {
00480 __m64 *filterInPtr = inPtr;
00481 __m64 *filterPtr = (__m64*)filterCoeffsAlign;
00482 uint filterCounter = lengthDiv8;
00483
00484 SI(m0 = _mm_setzero_si64(), pxor_r2r(mm0, mm0));
00485 SI(m1 = filterInPtr[0], movq_a2r(0, filterInPtr, mm1));
00486 SI(m7 = _mm_setzero_si64(), pxor_r2r(mm7, mm7));
00487
00488 do {
00489 SI(m2 = filterInPtr[1], movq_a2r(8, filterInPtr, mm2));
00490 SI(m4 = m1, movq_r2r(mm1, mm4));
00491 SI(m3 = filterInPtr[2], movq_a2r(16, filterInPtr, mm3));
00492 SI(m1 = _mm_unpackhi_pi16(m1, m2), punpckhwd_r2r(mm2, mm1));
00493 SI(m6 = m2, movq_r2r(mm2, mm6));
00494 SI(m4 = _mm_unpacklo_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
00495 SI(m2 = filterPtr[0], movq_a2r(0, filterPtr, mm2));
00496 SI(m5 = m1, movq_r2r(mm1, mm5));
00497 SI(m6 = _mm_unpacklo_pi16(m6, m3), punpcklwd_r2r(mm3, mm6));
00498 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
00499 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
00500 SI(m2 = filterPtr[1], movq_a2r(8, filterPtr, mm2));
00501 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
00502 SI(m4 = m3, movq_r2r(mm3, mm4));
00503 SI(m1 = _mm_madd_pi16(m1, m2), pmaddwd_r2r(mm2, mm1));
00504 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
00505 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
00506 SI(m2 = filterInPtr[3], movq_a2r(24, filterInPtr, mm2));
00507 SI(m0 = _mm_add_pi32(m0, m1), paddd_r2r(mm1, mm0));
00508 SI(m1 = filterInPtr[4], movq_a2r(32, filterInPtr, mm1));
00509 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
00510 SI(m3 = _mm_unpackhi_pi16(m3, m2), punpckhwd_r2r(mm2, mm3));
00511 SI(m6 = m2, movq_r2r(mm2, mm6));
00512 SI(m4 = _mm_unpackhi_pi16(m4, m2), punpcklwd_r2r(mm2, mm4));
00513 SI(m2 = filterPtr[2], movq_a2r(16, filterInPtr, mm2));
00514 SI(m5 = m3, movq_r2r(mm3, mm5));
00515 SI(m6 = _mm_unpackhi_pi16(m6, m1), punpcklwd_r2r(mm1, mm6));
00516 filterPtr += 4;
00517 SI(m4 = _mm_madd_pi16(m4, m2), pmaddwd_r2r(mm2, mm4));
00518 filterInPtr += 4;
00519 SI(m5 = _mm_madd_pi16(m5, m2), pmaddwd_r2r(mm2, mm5));
00520 SI(m2 = filterPtr[-1], movq_a2r(-8, filterPtr, mm2));
00521 SI(m0 = _mm_add_pi32(m0, m4), paddd_r2r(mm4, mm0));
00522 SI(m3 = _mm_madd_pi16(m3, m2), pmaddwd_r2r(mm2, mm3));
00523 SI(m7 = _mm_add_pi32(m7, m5), paddd_r2r(mm5, mm7));
00524 SI(m6 = _mm_madd_pi16(m6, m2), pmaddwd_r2r(mm2, mm6));
00525 SI(m0 = _mm_add_pi32(m0, m3), paddd_r2r(mm3, mm0));
00526 SI(m7 = _mm_add_pi32(m7, m6), paddd_r2r(mm6, mm7));
00527 } while ((--filterCounter)!=0);
00528
00529 SI(m4 = _mm_cvtsi32_si64(resultDivFactor), movd_v2r(resultDivFactor, mm4));
00530
00531 SI(m0 = _mm_srai_pi32(m0, m4), psrad_r2r(mm4, mm0));
00532 outPtr++;
00533
00534 SI(m7 = _mm_srai_pi32(m7, m4), psrad_r2r(mm4, mm7));
00535 inPtr++;
00536
00537 SI(m0 = _mm_packs_pi32(m0, m7), packssdw_r2r(mm7, mm0));
00538 SI(*outPtr = m0, movq_r2a(mm0, 0, outPtr));
00539 } while ((--counter)!=0);
00540
00541 _mm_empty();
00542 return (numSamples & 0xfffffffe) - length;
00543 }
00544
00545 #endif // ALLOW_MMX