00001
00002
00003
00004
00005
/*
 * The SSE code paths of this template are only kept when SSE2 is also
 * available.  Otherwise HAVE_SSE is dropped here, so every later
 * "#ifdef HAVE_SSE" in this file selects its non-SSE (MMX) alternative.
 */
#if !defined(HAVE_SSE2)
#undef HAVE_SSE
#endif
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038
00039
00040
00041
00042
00043
00044
00045
00046
00047
00048
00049
00050
00051
00052
00053
00054
00055
00056
00057
00058
00059
00060
00061
00062
00063
00064
00065
00066
/*
 * HAVE_ONLY_MMX1 is (re)derived on every inclusion: it ends up defined
 * exactly when HAVE_MMX is set and none of HAVE_MMX2, HAVE_3DNOW or
 * HAVE_SSE is.  Later in this file it selects a larger MIN_LEN and
 * removes the PREFETCH lines from the copy loops.
 */
#undef HAVE_ONLY_MMX1
#if defined(HAVE_MMX)
#if !(defined(HAVE_MMX2) || defined(HAVE_3DNOW) || defined(HAVE_SSE))
#define HAVE_ONLY_MMX1
#endif
#endif
00076
00077
00078
/*
 * small_memcpy(to, from, n) - copy n bytes with a single "rep; movsb".
 *
 * to, from : pointer lvalues.  They are both inputs and outputs of the asm
 *            statement (tied to the destination-index and source-index
 *            registers through the "D" and "S" constraints), so after the
 *            macro runs each one points just past the last byte copied.
 * n        : byte count, loaded into the count register ("c"); the
 *            decremented count is discarded into a local dummy.
 *
 * The "memory" clobber tells the compiler that the statement reads and
 * writes memory it cannot see through the operands.
 *
 * The body is wrapped in do { } while (0) so that the macro followed by a
 * semicolon is exactly one statement and can be used safely as the body of
 * an if/else.
 */
#define small_memcpy(to,from,n)\
do {\
    register unsigned long int dummy;\
    __asm__ __volatile__(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        :"0" (to), "1" (from),"2" (n)\
        : "memory");\
} while (0)
00091
/*
 * Per-instruction-set tuning macros.  Each one is #undef'd first so the
 * values can be re-derived from the HAVE_* flags in effect.
 */

/* Destination alignment (in bytes) established before the block copy loops. */
#undef MMREG_SIZE
#ifdef HAVE_SSE
#define MMREG_SIZE 16
#else
#define MMREG_SIZE 64 //8
#endif

#undef PREFETCH
#undef EMMS

/*
 * PREFETCH is pasted in front of an address operand inside the asm strings,
 * e.g.  PREFETCH " 320(%0)\n".  When no prefetch instruction is selected it
 * must therefore turn the whole line into an assembler comment.  '#' is the
 * comment character of x86 AT&T syntax; the previous "/nop" spelling relied
 * on '/' being accepted as a line-comment character, which depends on the
 * assembler and target.
 */
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH " # nop"
#endif

/* Instruction issued after the MMX copy loops to leave MMX state. */
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* 64-bit store used by the MMX loops: movntq with MMX2, plain movq otherwise. */
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

/* Copies shorter than MIN_LEN bytes skip the block loops entirely. */
#undef MIN_LEN
#ifdef HAVE_ONLY_MMX1
#define MIN_LEN 0x800
#else
#define MIN_LEN 0x40
#endif
00130
00131 void * RENAME(fast_memcpy)(void * to, const void * from, size_t len);
00132
00133 void * RENAME(fast_memcpy)(void * to, const void * from, size_t len)
00134 {
00135 void *retval;
00136 size_t i;
00137 retval = to;
00138 long oldbx;
00139
00140 (void) oldbx;
00141 #ifdef STATISTIC
00142
00143 {
00144 static int freq[33];
00145 static int t=0;
00146 int i;
00147 for(i=0; len>(1<<i); i++);
00148 freq[i]++;
00149 t++;
00150 if(t % (1024 * 1024) == 0)
00151 for(i=0; i<32; i++)
00152 printf("freq < %8d %4d\n", 1<<i, freq[i]);
00153 }
00154 #endif
00155
00156 #ifndef HAVE_ONLY_MMX1
00157
00158 __asm__ __volatile__ (
00159 PREFETCH" (%0)\n"
00160 PREFETCH" 64(%0)\n"
00161 PREFETCH" 128(%0)\n"
00162 PREFETCH" 192(%0)\n"
00163 PREFETCH" 256(%0)\n"
00164 : : "r" (from) );
00165 #endif
00166 if (len >= MIN_LEN)
00167 {
00168 register unsigned long int delta;
00169
00170 delta = ((unsigned long int)to)&(MMREG_SIZE-1);
00171 if (delta)
00172 {
00173 delta=MMREG_SIZE-delta;
00174 len -= delta;
00175 small_memcpy(to, from, delta);
00176 }
00177 i = len >> 6;
00178 len&=63;
00179
00180
00181
00182
00183
00184
00185
00186
00187
00188 #ifdef HAVE_SSE
00189 if (((unsigned long)from) & 15)
00190
00191 for (; i>0; i--)
00192 {
00193 __asm__ __volatile__ (
00194 PREFETCH" 320(%0)\n"
00195 "movups (%0), %%xmm0\n"
00196 "movups 16(%0), %%xmm1\n"
00197 "movups 32(%0), %%xmm2\n"
00198 "movups 48(%0), %%xmm3\n"
00199 "movntps %%xmm0, (%1)\n"
00200 "movntps %%xmm1, 16(%1)\n"
00201 "movntps %%xmm2, 32(%1)\n"
00202 "movntps %%xmm3, 48(%1)\n"
00203 :
00204 : "r" (from), "r" (to)
00205 : "memory"
00206 );
00207 from=((const unsigned char *) from)+64;
00208 to=((unsigned char *)to)+64;
00209 }
00210 else
00211
00212
00213
00214
00215
00216 for (; i>0; i--)
00217 {
00218 __asm__ __volatile__ (
00219 PREFETCH" 320(%0)\n"
00220 "movaps (%0), %%xmm0\n"
00221 "movaps 16(%0), %%xmm1\n"
00222 "movaps 32(%0), %%xmm2\n"
00223 "movaps 48(%0), %%xmm3\n"
00224 "movntps %%xmm0, (%1)\n"
00225 "movntps %%xmm1, 16(%1)\n"
00226 "movntps %%xmm2, 32(%1)\n"
00227 "movntps %%xmm3, 48(%1)\n"
00228 :: "r" (from), "r" (to) : "memory");
00229 from=((const unsigned char *)from)+64;
00230 to=((unsigned char *)to)+64;
00231 }
00232 #else
00233
00234 for (; ((long)to & (BLOCK_SIZE-1)) && i>0; i--)
00235 {
00236 __asm__ __volatile__ (
00237 #ifndef HAVE_ONLY_MMX1
00238 PREFETCH" 320(%0)\n"
00239 #endif
00240 "movq (%0), %%mm0\n"
00241 "movq 8(%0), %%mm1\n"
00242 "movq 16(%0), %%mm2\n"
00243 "movq 24(%0), %%mm3\n"
00244 "movq 32(%0), %%mm4\n"
00245 "movq 40(%0), %%mm5\n"
00246 "movq 48(%0), %%mm6\n"
00247 "movq 56(%0), %%mm7\n"
00248 MOVNTQ" %%mm0, (%1)\n"
00249 MOVNTQ" %%mm1, 8(%1)\n"
00250 MOVNTQ" %%mm2, 16(%1)\n"
00251 MOVNTQ" %%mm3, 24(%1)\n"
00252 MOVNTQ" %%mm4, 32(%1)\n"
00253 MOVNTQ" %%mm5, 40(%1)\n"
00254 MOVNTQ" %%mm6, 48(%1)\n"
00255 MOVNTQ" %%mm7, 56(%1)\n"
00256 :: "r" (from), "r" (to) : "memory");
00257 from=((const unsigned char *)from)+64;
00258 to=((unsigned char *)to)+64;
00259 }
00260
00261
00262
00263 if (i>=BLOCK_SIZE/64)
00264 __asm__ volatile(
00265
00266 MOVX" %%"REG_b", %6\n\t"
00267 "xor %%"REG_a", %%"REG_a" \n\t"
00268 ASMALIGN(4)
00269 "1: \n\t"
00270 "movl (%0, %%"REG_a"), %%ebx \n\t"
00271 "movl 32(%0, %%"REG_a"), %%ebx \n\t"
00272 "movl 64(%0, %%"REG_a"), %%ebx \n\t"
00273 "movl 96(%0, %%"REG_a"), %%ebx \n\t"
00274 "add $128, %%"REG_a" \n\t"
00275 "cmp %3, %%"REG_a" \n\t"
00276 " jb 1b \n\t"
00277
00278 "xor %%"REG_a", %%"REG_a" \n\t"
00279 ASMALIGN(4)
00280 "2: \n\t"
00281 "movq (%0, %%"REG_a"), %%mm0\n"
00282 "movq 8(%0, %%"REG_a"), %%mm1\n"
00283 "movq 16(%0, %%"REG_a"), %%mm2\n"
00284 "movq 24(%0, %%"REG_a"), %%mm3\n"
00285 "movq 32(%0, %%"REG_a"), %%mm4\n"
00286 "movq 40(%0, %%"REG_a"), %%mm5\n"
00287 "movq 48(%0, %%"REG_a"), %%mm6\n"
00288 "movq 56(%0, %%"REG_a"), %%mm7\n"
00289 MOVNTQ" %%mm0, (%1, %%"REG_a")\n"
00290 MOVNTQ" %%mm1, 8(%1, %%"REG_a")\n"
00291 MOVNTQ" %%mm2, 16(%1, %%"REG_a")\n"
00292 MOVNTQ" %%mm3, 24(%1, %%"REG_a")\n"
00293 MOVNTQ" %%mm4, 32(%1, %%"REG_a")\n"
00294 MOVNTQ" %%mm5, 40(%1, %%"REG_a")\n"
00295 MOVNTQ" %%mm6, 48(%1, %%"REG_a")\n"
00296 MOVNTQ" %%mm7, 56(%1, %%"REG_a")\n"
00297 "add $64, %%"REG_a" \n\t"
00298 "cmp %3, %%"REG_a" \n\t"
00299 "jb 2b \n\t"
00300
00301 #if CONFUSION_FACTOR > 0
00302
00303 "mov %5, %%"REG_a" \n\t"
00304 "2: \n\t"
00305 "movl (%0), %%ebx \n\t"
00306 "movl (%0), %%ebx \n\t"
00307 "movl (%0), %%ebx \n\t"
00308 "movl (%0), %%ebx \n\t"
00309 "dec %%"REG_a" \n\t"
00310 " jnz 2b \n\t"
00311 #endif
00312
00313 "xor %%"REG_a", %%"REG_a" \n\t"
00314 "add %3, %0 \n\t"
00315 "add %3, %1 \n\t"
00316 "sub %4, %2 \n\t"
00317 "cmp %4, %2 \n\t"
00318 " jae 1b \n\t"
00319 MOVX" %6, %%"REG_b" \n\t"
00320 : "+r" (from), "+r" (to), "+r" (i)
00321 : "r" ((long)BLOCK_SIZE), "i" (BLOCK_SIZE/64), "i" ((long)CONFUSION_FACTOR), "m" (oldbx)
00322 : "%"REG_a, "memory"
00323 );
00324
00325 for (; i>0; i--)
00326 {
00327 __asm__ __volatile__ (
00328 #ifndef HAVE_ONLY_MMX1
00329 PREFETCH" 320(%0)\n"
00330 #endif
00331 "movq (%0), %%mm0\n"
00332 "movq 8(%0), %%mm1\n"
00333 "movq 16(%0), %%mm2\n"
00334 "movq 24(%0), %%mm3\n"
00335 "movq 32(%0), %%mm4\n"
00336 "movq 40(%0), %%mm5\n"
00337 "movq 48(%0), %%mm6\n"
00338 "movq 56(%0), %%mm7\n"
00339 MOVNTQ" %%mm0, (%1)\n"
00340 MOVNTQ" %%mm1, 8(%1)\n"
00341 MOVNTQ" %%mm2, 16(%1)\n"
00342 MOVNTQ" %%mm3, 24(%1)\n"
00343 MOVNTQ" %%mm4, 32(%1)\n"
00344 MOVNTQ" %%mm5, 40(%1)\n"
00345 MOVNTQ" %%mm6, 48(%1)\n"
00346 MOVNTQ" %%mm7, 56(%1)\n"
00347 :: "r" (from), "r" (to) : "memory");
00348 from=((const unsigned char *)from)+64;
00349 to=((unsigned char *)to)+64;
00350 }
00351
00352 #endif
00353 #ifdef HAVE_MMX2
00354
00355
00356 __asm__ __volatile__ ("sfence":::"memory");
00357 #endif
00358 #ifndef HAVE_SSE
00359
00360 __asm__ __volatile__ (EMMS:::"memory");
00361 #endif
00362 }
00363
00364
00365
00366 if (len) small_memcpy(to, from, len);
00367 return retval;
00368 }
00369
00370
00371