00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #include "../dsputil.h"
00023 #include "mmx.h"
00024
/*
 * Rounding constants for the plain-MMX interpolating SAD kernels.
 * Entry i holds the value i replicated in each of the four 16-bit lanes.
 *
 * Within this file, round_tab[1] is loaded into mm5 by the x2/y2 wrappers
 * (added before a ">> 1") and round_tab[2] by the xy2 wrappers (added before
 * a ">> 2"). round_tab[0] is not referenced in the visible code.
 * Aligned to 8 bytes so it can be fetched with a single movq.
 */
00025 static const __attribute__ ((aligned(8))) uint64_t round_tab[3]={
00026 0x0000000000000000ULL,
00027 0x0001000100010001ULL,
00028 0x0002000200020002ULL,
00029 };
00030
/*
 * "Byte one": 0x01 replicated in each of the eight byte lanes.
 * It is referenced only by name from inside an asm string (MANGLE(bone) in
 * sad8_4_mmx2), where it is loaded into mm5 and subtracted with psubusb.
 * attribute_used is defined elsewhere; presumably it keeps the symbol from
 * being discarded as unreferenced - confirm against its definition.
 */
00031 static attribute_used __attribute__ ((aligned(8))) uint64_t bone= 0x0101010101010101LL;
00032
/*
 * Add the sum of absolute differences (SAD) of an 8-pixel-wide column of
 * rows between blk1 and blk2 to the accumulator in mm6 (plain MMX version).
 *
 * blk1, blk2  first byte of the two pixel areas
 * stride      byte distance between consecutive rows, used for both areas
 * h           number of rows
 *
 * Nothing is returned through C; the function works on MMX register state:
 *   - mm7 must be zero on entry (it is the unpack source that widens bytes
 *     to 16-bit words).
 *   - mm6 receives four 16-bit partial sums (paddw), to be folded later.
 *   - mm0-mm5 are overwritten.
 *
 * The loop handles TWO rows per iteration and always runs at least once, so
 * h is expected to be even and non-zero; with a positive stride an odd h
 * would process one row beyond the requested range.
 *
 * NOTE(review): the asm statement declares no clobbers although it modifies
 * mm0-mm6, and mm6/mm7 are set up by a separate asm statement in the caller.
 * NOTE(review): len is a long bound to the "a" register while the addressing
 * uses REG_a (defined elsewhere); confirm both have the same width on every
 * supported ABI.
 */
00033 static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00034 {
/* Negative byte offset counting up towards zero; the pointers passed to the
 * asm are biased by -len so that (ptr, offset) starts at the first row. */
00035 long len= -(stride*h);
00036 asm volatile(
00037 ".balign 16 \n\t"
00038 "1: \n\t"
/* first row: blk1 in mm0, blk2 in both mm2 and mm4 */
00039 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00040 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00041 "movq (%2, %%"REG_a"), %%mm4 \n\t"
/* step to the second row of this iteration */
00042 "add %3, %%"REG_a" \n\t"
/* saturating subtractions in both directions; OR-ing them later gives |a-b| */
00043 "psubusb %%mm0, %%mm2 \n\t"
00044 "psubusb %%mm4, %%mm0 \n\t"
/* second row: blk1 in mm1, blk2 in both mm3 and mm5 */
00045 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00046 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00047 "movq (%2, %%"REG_a"), %%mm5 \n\t"
00048 "psubusb %%mm1, %%mm3 \n\t"
00049 "psubusb %%mm5, %%mm1 \n\t"
/* mm0 = |diff| of first row, mm3 = |diff| of second row (bytes) */
00050 "por %%mm2, %%mm0 \n\t"
00051 "por %%mm1, %%mm3 \n\t"
/* widen the byte differences to words (mm7 == 0) and sum them */
00052 "movq %%mm0, %%mm1 \n\t"
00053 "movq %%mm3, %%mm2 \n\t"
00054 "punpcklbw %%mm7, %%mm0 \n\t"
00055 "punpckhbw %%mm7, %%mm1 \n\t"
00056 "punpcklbw %%mm7, %%mm3 \n\t"
00057 "punpckhbw %%mm7, %%mm2 \n\t"
00058 "paddw %%mm1, %%mm0 \n\t"
00059 "paddw %%mm3, %%mm2 \n\t"
00060 "paddw %%mm2, %%mm0 \n\t"
/* accumulate into the running total */
00061 "paddw %%mm0, %%mm6 \n\t"
/* advance to the next row pair; keep looping while the offset is negative */
00062 "add %3, %%"REG_a" \n\t"
00063 " js 1b \n\t"
00064 : "+a" (len)
00065 : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
00066 );
00067 }
00068
/*
 * MMX2 version of the 8-pixel-wide SAD accumulation: adds the sum of
 * absolute differences between the rows of blk1 and blk2 to mm6.
 *
 * blk1, blk2  first byte of the two pixel areas
 * stride      byte distance between consecutive rows, used for both areas
 * h           number of rows
 *
 * psadbw produces the per-row SAD directly, so no byte->word unpacking (and
 * no zeroed mm7) is needed here. mm6 is the accumulator; mm0-mm3 are
 * overwritten.
 *
 * Two rows are processed per iteration and the loop body always runs at
 * least once, so h is expected to be even and non-zero.
 *
 * NOTE(review): no clobbers are declared although mm0-mm3 and mm6 are
 * modified; mm6 is initialised by a separate asm statement in the caller.
 */
00069 static inline void sad8_1_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00070 {
/* Negative byte offset that counts up to zero; pointers are biased by -len. */
00071 long len= -(stride*h);
00072 asm volatile(
00073 ".balign 16 \n\t"
00074 "1: \n\t"
/* first row: mm0 = SAD(blk1 row, blk2 row) */
00075 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00076 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00077 "psadbw %%mm2, %%mm0 \n\t"
00078 "add %3, %%"REG_a" \n\t"
/* second row: mm3 = SAD(blk1 row, blk2 row) */
00079 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00080 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00081 "psadbw %%mm1, %%mm3 \n\t"
/* combine both rows and add to the running total */
00082 "paddw %%mm3, %%mm0 \n\t"
00083 "paddw %%mm0, %%mm6 \n\t"
00084 "add %3, %%"REG_a" \n\t"
00085 " js 1b \n\t"
00086 : "+a" (len)
00087 : "r" (blk1 - len), "r" (blk2 - len), "r" ((long)stride)
00088 );
00089 }
00090
/*
 * MMX2 SAD against a two-source average: for each 8-byte row, average
 * blk1a and blk1b with pavgb, take the SAD of that average against blk2
 * with psadbw, and add the result to mm6.
 *
 * blk1a, blk1b  the two areas to average (the callers in this file pass
 *               blk1 and blk1+1, or blk1 and blk1+stride)
 * blk2          area compared against the average
 * stride        byte distance between consecutive rows, used for all three
 * h             number of rows
 *
 * pavgb rounds up, i.e. computes (a + b + 1) >> 1 per byte, so the rounding
 * constant in mm5 is not used by this variant. mm6 is the accumulator;
 * mm0-mm3 are overwritten.
 *
 * Two rows are processed per iteration and the loop body always runs at
 * least once, so h is expected to be even and non-zero.
 *
 * NOTE(review): no clobbers are declared although MMX registers are modified.
 */
00091 static inline void sad8_2_mmx2(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
00092 {
/* Negative byte offset that counts up to zero; pointers are biased by -len. */
00093 long len= -(stride*h);
00094 asm volatile(
00095 ".balign 16 \n\t"
00096 "1: \n\t"
/* first row: mm0 = avg(blk1a, blk1b), then SAD against blk2 */
00097 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00098 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00099 "pavgb %%mm2, %%mm0 \n\t"
00100 "movq (%3, %%"REG_a"), %%mm2 \n\t"
00101 "psadbw %%mm2, %%mm0 \n\t"
00102 "add %4, %%"REG_a" \n\t"
/* second row: mm3 = avg(blk1a, blk1b), then SAD against blk2 */
00103 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00104 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00105 "pavgb %%mm1, %%mm3 \n\t"
00106 "movq (%3, %%"REG_a"), %%mm1 \n\t"
00107 "psadbw %%mm1, %%mm3 \n\t"
/* combine both rows and add to the running total */
00108 "paddw %%mm3, %%mm0 \n\t"
00109 "paddw %%mm0, %%mm6 \n\t"
00110 "add %4, %%"REG_a" \n\t"
00111 " js 1b \n\t"
00112 : "+a" (len)
00113 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
00114 );
00115 }
00116
/*
 * MMX2 SAD against a four-neighbour (half-pel x and y) average: for each
 * row, combine blk1[x], blk1[x+1], blk1[x+stride] and blk1[x+stride+1] with
 * nested pavgb operations, take the SAD against blk2 and add it to mm6.
 *
 * blk1    area to interpolate; one extra column and one extra row beyond
 *         the 8 x h block are read
 * blk2    area compared against the interpolated values
 * stride  byte distance between consecutive rows, used for both areas
 * h       number of rows
 *
 * The result of the nested averages is
 *     avg( avg(a, c), avg(b, d) - 1 )      (pavgb rounds up, psubusb saturates)
 * which approximates, but is not guaranteed to equal, (a+b+c+d+2) >> 2 as
 * computed by sad8_4_mmx. Presumably this is why dsputil_init_pix_mmx does
 * not install the MMX2 interpolating variants when CODEC_FLAG_BITEXACT is
 * set - confirm.
 *
 * mm5 is overwritten with the constant `bone`, mm6 is the accumulator and
 * mm0-mm4 are scratch. Two rows are processed per iteration and the loop
 * body always runs at least once, so h is expected to be even and non-zero.
 *
 * NOTE(review): .balign 16 is placed before the movq that loads `bone`, so
 * the loop label "1:" itself is not necessarily 16-byte aligned.
 * NOTE(review): no clobbers are declared although MMX registers are modified.
 */
00117 static inline void sad8_4_mmx2(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00118 {
/* Negative byte offset that counts up to zero; pointers are biased by -len. */
00119 long len= -(stride*h);
00120 asm volatile(
00121 ".balign 16 \n\t"
/* mm5 = 0x01 in every byte, used to bias one of the averages downwards */
00122 "movq "MANGLE(bone)", %%mm5 \n\t"
00123 "1: \n\t"
/* %1 addresses the current row of blk1, %2 the row below it */
00124 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00125 "movq (%2, %%"REG_a"), %%mm2 \n\t"
00126 "movq 1(%1, %%"REG_a"), %%mm1 \n\t"
00127 "movq 1(%2, %%"REG_a"), %%mm3 \n\t"
/* mm0 = vertical average at x, mm3 = vertical average at x+1 */
00128 "pavgb %%mm2, %%mm0 \n\t"
00129 "pavgb %%mm1, %%mm3 \n\t"
/* subtract 1 (saturating) from one side before the final average */
00130 "psubusb %%mm5, %%mm3 \n\t"
00131 "pavgb %%mm3, %%mm0 \n\t"
00132 "movq (%3, %%"REG_a"), %%mm2 \n\t"
00133 "psadbw %%mm2, %%mm0 \n\t"
00134 "add %4, %%"REG_a" \n\t"
/* second row of this iteration, same computation with result in mm2 */
00135 "movq (%1, %%"REG_a"), %%mm1 \n\t"
00136 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00137 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
00138 "movq 1(%2, %%"REG_a"), %%mm4 \n\t"
00139 "pavgb %%mm3, %%mm1 \n\t"
00140 "pavgb %%mm4, %%mm2 \n\t"
00141 "psubusb %%mm5, %%mm2 \n\t"
00142 "pavgb %%mm1, %%mm2 \n\t"
00143 "movq (%3, %%"REG_a"), %%mm1 \n\t"
00144 "psadbw %%mm1, %%mm2 \n\t"
/* combine both rows and add to the running total */
00145 "paddw %%mm2, %%mm0 \n\t"
00146 "paddw %%mm0, %%mm6 \n\t"
00147 "add %4, %%"REG_a" \n\t"
00148 " js 1b \n\t"
00149 : "+a" (len)
00150 : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), "r" ((long)stride)
00151 );
00152 }
00153
/*
 * Plain-MMX SAD against a two-source average: for each 8-byte row compute
 * (blk1a + blk1b + mm5) >> 1 per pixel in 16-bit precision, take the
 * absolute difference against blk2 and add the widened result to mm6.
 *
 * blk1a, blk1b  the two areas to average (the callers in this file pass
 *               blk1 and blk1+1, or blk1 and blk1+stride)
 * blk2          area compared against the average
 * stride        byte distance between consecutive rows, used for all three
 * h             number of rows
 *
 * Register contract:
 *   - mm7 must be zero (unpack source for byte->word widening).
 *   - mm5 must hold the rounding term in each 16-bit lane; the wrappers in
 *     this file load round_tab[1] into it.
 *   - mm6 receives four 16-bit partial sums; mm0-mm4 are overwritten.
 *
 * Unlike the MMX2 kernels, ONE row is processed per iteration. The loop body
 * always runs at least once, so h is expected to be non-zero.
 *
 * NOTE(review): no clobbers are declared although MMX registers are modified.
 */
00154 static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
00155 {
/* Negative byte offset that counts up to zero; pointers are biased by -len. */
00156 long len= -(stride*h);
00157 asm volatile(
00158 ".balign 16 \n\t"
00159 "1: \n\t"
/* load each source row twice so low and high halves can be widened separately */
00160 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00161 "movq (%2, %%"REG_a"), %%mm1 \n\t"
00162 "movq (%1, %%"REG_a"), %%mm2 \n\t"
00163 "movq (%2, %%"REG_a"), %%mm3 \n\t"
00164 "punpcklbw %%mm7, %%mm0 \n\t"
00165 "punpcklbw %%mm7, %%mm1 \n\t"
00166 "punpckhbw %%mm7, %%mm2 \n\t"
00167 "punpckhbw %%mm7, %%mm3 \n\t"
/* mm1 = low-half sums, mm3 = high-half sums (16-bit) */
00168 "paddw %%mm0, %%mm1 \n\t"
00169 "paddw %%mm2, %%mm3 \n\t"
/* reference row, kept in two registers for the two-way subtraction below */
00170 "movq (%3, %%"REG_a"), %%mm4 \n\t"
00171 "movq (%3, %%"REG_a"), %%mm2 \n\t"
/* add rounding term, halve, and pack back to bytes in mm1 */
00172 "paddw %%mm5, %%mm1 \n\t"
00173 "paddw %%mm5, %%mm3 \n\t"
00174 "psrlw $1, %%mm1 \n\t"
00175 "psrlw $1, %%mm3 \n\t"
00176 "packuswb %%mm3, %%mm1 \n\t"
/* |avg - blk2| via two saturating subtractions OR-ed together */
00177 "psubusb %%mm1, %%mm4 \n\t"
00178 "psubusb %%mm2, %%mm1 \n\t"
00179 "por %%mm4, %%mm1 \n\t"
/* widen the absolute differences to words and accumulate */
00180 "movq %%mm1, %%mm0 \n\t"
00181 "punpcklbw %%mm7, %%mm0 \n\t"
00182 "punpckhbw %%mm7, %%mm1 \n\t"
00183 "paddw %%mm1, %%mm0 \n\t"
00184 "paddw %%mm0, %%mm6 \n\t"
00185 "add %4, %%"REG_a" \n\t"
00186 " js 1b \n\t"
00187 : "+a" (len)
00188 : "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((long)stride)
00189 );
00190 }
00191
/*
 * Plain-MMX SAD against a four-neighbour (half-pel x and y) average: for
 * each pixel compute
 *     (blk1[x] + blk1[x+1] + blk1[x+stride] + blk1[x+stride+1] + mm5) >> 2
 * in 16-bit precision, take the absolute difference against blk2 and add
 * the widened result to mm6.
 *
 * blk1    area to interpolate; one extra column and one extra row beyond
 *         the 8 x h block are read
 * blk2    area compared against the interpolated values
 * stride  byte distance between consecutive rows, used for both areas
 * h       number of rows
 *
 * Register contract:
 *   - mm7 must be zero (unpack source for byte->word widening).
 *   - mm5 must hold the rounding term in each 16-bit lane; the wrappers in
 *     this file load round_tab[2] into it.
 *   - mm6 receives four 16-bit partial sums; mm0-mm4 are overwritten.
 *
 * ONE row is processed per iteration. The loop body always runs at least
 * once, so h is expected to be non-zero.
 *
 * NOTE(review): no clobbers are declared although MMX registers are modified.
 */
00192 static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
00193 {
/* Negative byte offset that counts up to zero; pointers are biased by -len. */
00194 long len= -(stride*h);
00195 asm volatile(
00196 ".balign 16 \n\t"
00197 "1: \n\t"
/* %1 = current row of blk1, %2 = row below; widen both to words */
00198 "movq (%1, %%"REG_a"), %%mm0 \n\t"
00199 "movq (%2, %%"REG_a"), %%mm1 \n\t"
00200 "movq %%mm0, %%mm4 \n\t"
00201 "movq %%mm1, %%mm2 \n\t"
00202 "punpcklbw %%mm7, %%mm0 \n\t"
00203 "punpcklbw %%mm7, %%mm1 \n\t"
00204 "punpckhbw %%mm7, %%mm4 \n\t"
00205 "punpckhbw %%mm7, %%mm2 \n\t"
/* mm0 / mm4 = low / high halves of the vertical pair sum at x */
00206 "paddw %%mm1, %%mm0 \n\t"
00207 "paddw %%mm2, %%mm4 \n\t"
/* same two rows shifted by one pixel (x+1) */
00208 "movq 1(%1, %%"REG_a"), %%mm2 \n\t"
00209 "movq 1(%2, %%"REG_a"), %%mm3 \n\t"
00210 "movq %%mm2, %%mm1 \n\t"
00211 "punpcklbw %%mm7, %%mm2 \n\t"
00212 "punpckhbw %%mm7, %%mm1 \n\t"
00213 "paddw %%mm0, %%mm2 \n\t"
00214 "paddw %%mm4, %%mm1 \n\t"
00215 "movq %%mm3, %%mm4 \n\t"
00216 "punpcklbw %%mm7, %%mm3 \n\t"
00217 "punpckhbw %%mm7, %%mm4 \n\t"
/* mm2 / mm1 = low / high halves of the four-neighbour sum */
00218 "paddw %%mm3, %%mm2 \n\t"
00219 "paddw %%mm4, %%mm1 \n\t"
/* reference row, kept in two registers for the two-way subtraction below */
00220 "movq (%3, %%"REG_a"), %%mm3 \n\t"
00221 "movq (%3, %%"REG_a"), %%mm4 \n\t"
/* add rounding term, divide by four, and pack back to bytes in mm2 */
00222 "paddw %%mm5, %%mm2 \n\t"
00223 "paddw %%mm5, %%mm1 \n\t"
00224 "psrlw $2, %%mm2 \n\t"
00225 "psrlw $2, %%mm1 \n\t"
00226 "packuswb %%mm1, %%mm2 \n\t"
/* |avg - blk2| via two saturating subtractions OR-ed together */
00227 "psubusb %%mm2, %%mm3 \n\t"
00228 "psubusb %%mm4, %%mm2 \n\t"
00229 "por %%mm3, %%mm2 \n\t"
/* widen the absolute differences to words and accumulate */
00230 "movq %%mm2, %%mm0 \n\t"
00231 "punpcklbw %%mm7, %%mm0 \n\t"
00232 "punpckhbw %%mm7, %%mm2 \n\t"
00233 "paddw %%mm2, %%mm0 \n\t"
00234 "paddw %%mm0, %%mm6 \n\t"
00235 "add %4, %%"REG_a" \n\t"
00236 " js 1b \n\t"
00237 : "+a" (len)
00238 : "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((long)stride)
00239 );
00240 }
00241
/*
 * Fold the four 16-bit partial sums held in mm6 into a single value and
 * return it.
 *
 * The additions use paddw, so the total wraps modulo 2^16; the final mask
 * discards the upper lanes that movd also copies out. mm0 and mm6 are
 * overwritten.
 *
 * The asm has no input operands: it relies on mm6 still holding what the
 * preceding sad8_*_mmx asm statements accumulated.
 */
00242 static inline int sum_mmx(void)
00243 {
00244 int ret;
00245 asm volatile(
/* add the upper two lanes onto the lower two */
00246 "movq %%mm6, %%mm0 \n\t"
00247 "psrlq $32, %%mm6 \n\t"
00248 "paddw %%mm0, %%mm6 \n\t"
/* add lane 1 onto lane 0 */
00249 "movq %%mm6, %%mm0 \n\t"
00250 "psrlq $16, %%mm6 \n\t"
00251 "paddw %%mm0, %%mm6 \n\t"
00252 "movd %%mm6, %0 \n\t"
00253 : "=r" (ret)
00254 );
/* only the low 16 bits carry the folded sum */
00255 return ret&0xFFFF;
00256 }
00257
/*
 * Return the low 32 bits of the accumulator mm6.
 *
 * No lane folding is done here: the MMX2 kernels in this file accumulate
 * psadbw results, which already are a single sum in the low word.
 * The asm has no input operands: it relies on mm6 still holding what the
 * preceding sad8_*_mmx2 asm statements accumulated.
 */
00258 static inline int sum_mmx2(void)
00259 {
00260 int ret;
00261 asm volatile(
00262 "movd %%mm6, %0 \n\t"
00263 : "=r" (ret)
00264 );
00265 return ret;
00266 }
00267
00268
/*
 * PIX_SAD(suf) generates the eight SAD entry points for one instruction-set
 * suffix (the file instantiates it for `mmx` and `mmx2`):
 *
 *   sad8_suf,  sad8_x2_suf,  sad8_y2_suf,  sad8_xy2_suf    (8 pixels wide)
 *   sad16_suf, sad16_x2_suf, sad16_y2_suf, sad16_xy2_suf   (16 pixels wide)
 *
 * All share the signature (void *v, uint8_t *blk2, uint8_t *blk1,
 * int stride, int h) and return the SAD as an int. `v` is not used. Note the
 * parameter order: blk2 comes first, and blk1 is the block that gets
 * interpolated in the x2/y2/xy2 variants.
 *
 * Every function follows the same three steps:
 *   1. an asm statement zeroes mm7 (unpack helper) and mm6 (accumulator);
 *      the interpolating variants also load a rounding constant into mm5:
 *      round_tab[1] for x2/y2, round_tab[2] for xy2;
 *   2. the matching sad8_{1,2,4}_suf kernel is called: once for the 8-wide
 *      functions, twice (columns 0-7 and 8-15) for the 16-wide ones;
 *   3. sum_suf() reduces mm6 to the return value.
 *
 * x2 averages blk1 with blk1+1 (horizontal neighbour), y2 averages blk1
 * with blk1+stride (vertical neighbour), xy2 uses the four-neighbour kernel.
 *
 * The 8-wide functions assert h==8 and pass the literal 8 to the kernel;
 * the 16-wide functions forward h unchanged.
 *
 * The three steps are separate asm statements / inline calls that
 * communicate only through MMX registers.
 */
00269 #define PIX_SAD(suf)\
00270 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00271 {\
00272 assert(h==8);\
00273 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00274 "pxor %%mm6, %%mm6 \n\t":);\
00275 \
00276 sad8_1_ ## suf(blk1, blk2, stride, 8);\
00277 \
00278 return sum_ ## suf();\
00279 }\
00280 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00281 {\
00282 assert(h==8);\
00283 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00284 "pxor %%mm6, %%mm6 \n\t"\
00285 "movq %0, %%mm5 \n\t"\
00286 :: "m"(round_tab[1]) \
00287 );\
00288 \
00289 sad8_2_ ## suf(blk1, blk1+1, blk2, stride, 8);\
00290 \
00291 return sum_ ## suf();\
00292 }\
00293 \
00294 static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00295 {\
00296 assert(h==8);\
00297 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00298 "pxor %%mm6, %%mm6 \n\t"\
00299 "movq %0, %%mm5 \n\t"\
00300 :: "m"(round_tab[1]) \
00301 );\
00302 \
00303 sad8_2_ ## suf(blk1, blk1+stride, blk2, stride, 8);\
00304 \
00305 return sum_ ## suf();\
00306 }\
00307 \
00308 static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00309 {\
00310 assert(h==8);\
00311 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00312 "pxor %%mm6, %%mm6 \n\t"\
00313 "movq %0, %%mm5 \n\t"\
00314 :: "m"(round_tab[2]) \
00315 );\
00316 \
00317 sad8_4_ ## suf(blk1, blk2, stride, 8);\
00318 \
00319 return sum_ ## suf();\
00320 }\
00321 \
00322 static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00323 {\
00324 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00325 "pxor %%mm6, %%mm6 \n\t":);\
00326 \
00327 sad8_1_ ## suf(blk1 , blk2 , stride, h);\
00328 sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
00329 \
00330 return sum_ ## suf();\
00331 }\
00332 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00333 {\
00334 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00335 "pxor %%mm6, %%mm6 \n\t"\
00336 "movq %0, %%mm5 \n\t"\
00337 :: "m"(round_tab[1]) \
00338 );\
00339 \
00340 sad8_2_ ## suf(blk1 , blk1+1, blk2 , stride, h);\
00341 sad8_2_ ## suf(blk1+8, blk1+9, blk2+8, stride, h);\
00342 \
00343 return sum_ ## suf();\
00344 }\
00345 static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00346 {\
00347 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00348 "pxor %%mm6, %%mm6 \n\t"\
00349 "movq %0, %%mm5 \n\t"\
00350 :: "m"(round_tab[1]) \
00351 );\
00352 \
00353 sad8_2_ ## suf(blk1 , blk1+stride, blk2 , stride, h);\
00354 sad8_2_ ## suf(blk1+8, blk1+stride+8,blk2+8, stride, h);\
00355 \
00356 return sum_ ## suf();\
00357 }\
00358 static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
00359 {\
00360 asm volatile("pxor %%mm7, %%mm7 \n\t"\
00361 "pxor %%mm6, %%mm6 \n\t"\
00362 "movq %0, %%mm5 \n\t"\
00363 :: "m"(round_tab[2]) \
00364 );\
00365 \
00366 sad8_4_ ## suf(blk1 , blk2 , stride, h);\
00367 sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
00368 \
00369 return sum_ ## suf();\
00370 }\
00371
00372 PIX_SAD(mmx)
00373 PIX_SAD(mmx2)
00374
00375 /*
00376  * Install the SAD routines of this file into a DSPContext, according to the
00377  * CPU capabilities found in mm_flags.
00378  *
00379  * c      context whose pix_abs[][] and sad[] function pointers are filled in
00380  * avctx  codec context; only its flags are read, and only when MM_MMXEXT is
00381  *        set in mm_flags
00382  *
00383  * With MM_MMX all eight pix_abs slots plus sad[0]/sad[1] get the plain-MMX
00384  * routines. With MM_MMXEXT the full-pel slots and sad[] are replaced by the
00385  * MMX2 routines; the interpolating (x2/y2/xy2) slots are replaced only when
00386  * CODEC_FLAG_BITEXACT is not requested. Nothing is touched if neither flag
00387  * is set.
00388  */
00389 void dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx)
00390 {
00391     const int have_mmx    = (mm_flags & MM_MMX)    != 0;
00392     const int have_mmxext = (mm_flags & MM_MMXEXT) != 0;
00393
00394     if (have_mmx) {
00395         /* generic SAD entry points */
00396         c->sad[0] = sad16_mmx;
00397         c->sad[1] = sad8_mmx;
00398
00399         /* 16x16: full-pel, x half-pel, y half-pel, xy half-pel */
00400         c->pix_abs[0][0] = sad16_mmx;
00401         c->pix_abs[0][1] = sad16_x2_mmx;
00402         c->pix_abs[0][2] = sad16_y2_mmx;
00403         c->pix_abs[0][3] = sad16_xy2_mmx;
00404
00405         /* 8x8: same layout */
00406         c->pix_abs[1][0] = sad8_mmx;
00407         c->pix_abs[1][1] = sad8_x2_mmx;
00408         c->pix_abs[1][2] = sad8_y2_mmx;
00409         c->pix_abs[1][3] = sad8_xy2_mmx;
00410     }
00411
00412     if (!have_mmxext)
00413         return;
00414
00415     /* full-pel SAD: the MMX2 versions always take over */
00416     c->sad[0]        = sad16_mmx2;
00417     c->sad[1]        = sad8_mmx2;
00418     c->pix_abs[0][0] = sad16_mmx2;
00419     c->pix_abs[1][0] = sad8_mmx2;
00420
00421     /* bit-exact mode keeps whatever interpolating routines were set above */
00422     if (avctx->flags & CODEC_FLAG_BITEXACT)
00423         return;
00424
00425     c->pix_abs[0][1] = sad16_x2_mmx2;
00426     c->pix_abs[0][2] = sad16_y2_mmx2;
00427     c->pix_abs[0][3] = sad16_xy2_mmx2;
00428     c->pix_abs[1][1] = sad8_x2_mmx2;
00429     c->pix_abs[1][2] = sad8_y2_mmx2;
00430     c->pix_abs[1][3] = sad8_xy2_mmx2;
00431 }