00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #include "../dsputil.h"
00024 #include "../simple_idct.h"
00025 #include "../mpegvideo.h"
00026 #include "mmx.h"
00027
00028
00029
00030
/* Table defined in another translation unit; the H.263 loop filters in this
 * file index it with qscale to obtain the filter strength. */
00031 extern const uint8_t ff_h263_loop_filter_strength[32];
00032
/* Global flag word. It is only defined here; nothing in the visible part of
 * this file reads or writes it.
 * NOTE(review): presumably holds detected CPU capability bits -- confirm
 * against the code that initialises it. */
00033 int mm_flags;
00034
00035
/* 8-byte aligned 64-bit constants; each one repeats a single value in every
 * lane so it can be loaded into an MMX register with one movq.
 * NOTE(review): attribute_used is a project macro not visible here; it
 * presumably keeps constants that are only referenced from inline asm from
 * being discarded -- confirm. */

/* 0x01 in each of the 8 bytes, 1 in each of the 4 words, 2 in each word. */
00036 static const uint64_t mm_bone attribute_used __attribute__ ((aligned(8))) = 0x0101010101010101ULL;
00037 static const uint64_t mm_wone attribute_used __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
00038 static const uint64_t mm_wtwo attribute_used __attribute__ ((aligned(8))) = 0x0002000200020002ULL;
00039
/* "pw" constants: the value named in the identifier (20, 3, 4, 5, 16, 32,
 * 64, 15) repeated in each of the four 16-bit words. */
00040 static const uint64_t ff_pw_20 attribute_used __attribute__ ((aligned(8))) = 0x0014001400140014ULL;
00041 static const uint64_t ff_pw_3 attribute_used __attribute__ ((aligned(8))) = 0x0003000300030003ULL;
00042 static const uint64_t ff_pw_4 attribute_used __attribute__ ((aligned(8))) = 0x0004000400040004ULL;
00043 static const uint64_t ff_pw_5 attribute_used __attribute__ ((aligned(8))) = 0x0005000500050005ULL;
00044 static const uint64_t ff_pw_16 attribute_used __attribute__ ((aligned(8))) = 0x0010001000100010ULL;
00045 static const uint64_t ff_pw_32 attribute_used __attribute__ ((aligned(8))) = 0x0020002000200020ULL;
00046 static const uint64_t ff_pw_64 attribute_used __attribute__ ((aligned(8))) = 0x0040004000400040ULL;
00047 static const uint64_t ff_pw_15 attribute_used __attribute__ ((aligned(8))) = 0x000F000F000F000FULL;
00048
/* "pb" constants: 0x3F resp. 0xFC repeated in each of the eight bytes.
 * ff_pb_FC is the mask operand passed to H263_LOOP_FILTER below. */
00049 static const uint64_t ff_pb_3F attribute_used __attribute__ ((aligned(8))) = 0x3F3F3F3F3F3F3F3FULL;
00050 static const uint64_t ff_pb_FC attribute_used __attribute__ ((aligned(8))) = 0xFCFCFCFCFCFCFCFCULL;
00051
/* JUMPALIGN emits a ".balign 8" directive at the point of use.
 * MOVQ_ZERO clears an MMX register by xor-ing it with itself. */
00052 #define JUMPALIGN() __asm __volatile (".balign 8"::)
00053 #define MOVQ_ZERO(regd) __asm __volatile ("pxor %%" #regd ", %%" #regd ::)
00054
/* MOVQ_WONE: set every bit (pcmpeqd reg,reg), then shift each 16-bit word
 * right by 15, leaving the value 1 in each word. */
00055 #define MOVQ_WONE(regd) \
00056 __asm __volatile ( \
00057 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00058 "psrlw $15, %%" #regd ::)
00059
/* MOVQ_BFE: set every bit, then add the register to itself bytewise;
 * 0xFF + 0xFF wraps to 0xFE, so each byte ends up as 0xFE. */
00060 #define MOVQ_BFE(regd) \
00061 __asm __volatile ( \
00062 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
00063 "paddb %%" #regd ", %%" #regd " \n\t" ::)
00064
/* Without PIC the byte-one and word-two patterns are loaded straight from the
 * mm_bone / mm_wtwo constants. With PIC defined they are synthesised in the
 * register instead, so no memory operand is needed. */
00065 #ifndef PIC
00066 #define MOVQ_BONE(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_bone))
00067 #define MOVQ_WTWO(regd) __asm __volatile ("movq %0, %%" #regd " \n\t" ::"m"(mm_wtwo))
00068 #else
00069
00070
/* PIC MOVQ_BONE: build 1 in every word (as in MOVQ_WONE), then pack the words
 * to bytes with the register as both sources, giving 0x01 in every byte. */
00071 #define MOVQ_BONE(regd) \
00072 __asm __volatile ( \
00073 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00074 "psrlw $15, %%" #regd " \n\t" \
00075 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
00076
/* PIC MOVQ_WTWO: build 1 in every word, then shift left by one to get 2. */
00077 #define MOVQ_WTWO(regd) \
00078 __asm __volatile ( \
00079 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
00080 "psrlw $15, %%" #regd " \n\t" \
00081 "psllw $1, %%" #regd " \n\t"::)
00082
00083 #endif
00084
00085
00086
00087
/* Instruction-string fragments (to be pasted inside an asm statement) that
 * average packed bytes using only plain MMX logic/arithmetic.
 *
 * PAVGB_MMX_NO_RND computes regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
 * and overwrites regb with the shifted term.
 * NOTE(review): regfe is expected to hold 0xFE in every byte (see MOVQ_BFE);
 * with that mask the 64-bit shift cannot move a bit across a byte boundary and
 * the result is the per-byte average rounded down -- confirm at the call sites. */
00088 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
00089 "movq " #rega ", " #regr " \n\t"\
00090 "pand " #regb ", " #regr " \n\t"\
00091 "pxor " #rega ", " #regb " \n\t"\
00092 "pand " #regfe "," #regb " \n\t"\
00093 "psrlq $1, " #regb " \n\t"\
00094 "paddb " #regb ", " #regr " \n\t"
00095
/* PAVGB_MMX computes regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1),
 * i.e. the same average with the half rounded up under the same regfe
 * assumption. regb is overwritten here as well. */
00096 #define PAVGB_MMX(rega, regb, regr, regfe) \
00097 "movq " #rega ", " #regr " \n\t"\
00098 "por " #regb ", " #regr " \n\t"\
00099 "pxor " #rega ", " #regb " \n\t"\
00100 "pand " #regfe "," #regb " \n\t"\
00101 "psrlq $1, " #regb " \n\t"\
00102 "psubb " #regb ", " #regr " \n\t"
00103
00104
/* Paired variants: the same two formulas applied to (rega, regb) -> regr and
 * (regc, regd) -> regp with the instructions interleaved. The mask register
 * is hard-coded as mm6 instead of being a parameter; regb and regd are
 * overwritten. */
00105 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
00106 "movq " #rega ", " #regr " \n\t"\
00107 "movq " #regc ", " #regp " \n\t"\
00108 "pand " #regb ", " #regr " \n\t"\
00109 "pand " #regd ", " #regp " \n\t"\
00110 "pxor " #rega ", " #regb " \n\t"\
00111 "pxor " #regc ", " #regd " \n\t"\
00112 "pand %%mm6, " #regb " \n\t"\
00113 "pand %%mm6, " #regd " \n\t"\
00114 "psrlq $1, " #regb " \n\t"\
00115 "psrlq $1, " #regd " \n\t"\
00116 "paddb " #regb ", " #regr " \n\t"\
00117 "paddb " #regd ", " #regp " \n\t"
00118
00119 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
00120 "movq " #rega ", " #regr " \n\t"\
00121 "movq " #regc ", " #regp " \n\t"\
00122 "por " #regb ", " #regr " \n\t"\
00123 "por " #regd ", " #regp " \n\t"\
00124 "pxor " #rega ", " #regb " \n\t"\
00125 "pxor " #regc ", " #regd " \n\t"\
00126 "pand %%mm6, " #regb " \n\t"\
00127 "pand %%mm6, " #regd " \n\t"\
00128 "psrlq $1, " #regd " \n\t"\
00129 "psrlq $1, " #regb " \n\t"\
00130 "psubb " #regb ", " #regr " \n\t"\
00131 "psubb " #regd ", " #regp " \n\t"
00132
00133
00134
/* The two headers included below are used as templates: each inclusion is
 * preceded by a set of macros that selects the generated names and the
 * averaging primitive, and followed by the matching #undefs.
 * NOTE(review): the headers themselves are not visible here; what exactly they
 * define through DEF() should be confirmed there. */

/* Instantiation 1: names of the form x_no_rnd_y_mmx, rounding constant set up
 * with MOVQ_WONE, averaging through the *_NO_RND macros. */
00135 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
00136 #define SET_RND MOVQ_WONE
00137 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
00138 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
00139
00140 #include "dsputil_mmx_rnd.h"
00141
00142 #undef DEF
00143 #undef SET_RND
00144 #undef PAVGBP
00145 #undef PAVGB
00146
00147
00148
/* Instantiation 2: names of the form x_y_mmx, rounding constant set up with
 * MOVQ_WTWO, averaging through the rounding PAVGB_MMX / PAVGBP_MMX macros. */
00149 #define DEF(x, y) x ## _ ## y ##_mmx
00150 #define SET_RND MOVQ_WTWO
00151 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
00152 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
00153
00154 #include "dsputil_mmx_rnd.h"
00155
00156 #undef DEF
00157 #undef SET_RND
00158 #undef PAVGBP
00159 #undef PAVGB
00160
00161
00162
00163
/* Instantiation 3: names of the form x_3dnow; PAVGB expands to the mnemonic
 * string "pavgusb". */
00164 #define DEF(x) x ## _3dnow
00165
00166 #define PAVGB "pavgusb"
00167
00168 #include "dsputil_mmx_avg.h"
00169
00170 #undef DEF
00171 #undef PAVGB
00172
00173
00174
00175
/* Instantiation 4: names of the form x_mmx2; PAVGB expands to the mnemonic
 * string "pavgb". */
00176 #define DEF(x) x ## _mmx2
00177
00178
00179 #define PAVGB "pavgb"
00180
00181 #include "dsputil_mmx_avg.h"
00182
00183 #undef DEF
00184 #undef PAVGB
00185
00186
00187
00188
00189 #ifdef CONFIG_ENCODERS
00190 static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
00191 {
00192 asm volatile(
00193 "mov $-128, %%"REG_a" \n\t"
00194 "pxor %%mm7, %%mm7 \n\t"
00195 ".balign 16 \n\t"
00196 "1: \n\t"
00197 "movq (%0), %%mm0 \n\t"
00198 "movq (%0, %2), %%mm2 \n\t"
00199 "movq %%mm0, %%mm1 \n\t"
00200 "movq %%mm2, %%mm3 \n\t"
00201 "punpcklbw %%mm7, %%mm0 \n\t"
00202 "punpckhbw %%mm7, %%mm1 \n\t"
00203 "punpcklbw %%mm7, %%mm2 \n\t"
00204 "punpckhbw %%mm7, %%mm3 \n\t"
00205 "movq %%mm0, (%1, %%"REG_a")\n\t"
00206 "movq %%mm1, 8(%1, %%"REG_a")\n\t"
00207 "movq %%mm2, 16(%1, %%"REG_a")\n\t"
00208 "movq %%mm3, 24(%1, %%"REG_a")\n\t"
00209 "add %3, %0 \n\t"
00210 "add $32, %%"REG_a" \n\t"
00211 "js 1b \n\t"
00212 : "+r" (pixels)
00213 : "r" (block+64), "r" ((long)line_size), "r" ((long)line_size*2)
00214 : "%"REG_a
00215 );
00216 }
00217
00218 static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
00219 {
00220 asm volatile(
00221 "pxor %%mm7, %%mm7 \n\t"
00222 "mov $-128, %%"REG_a" \n\t"
00223 ".balign 16 \n\t"
00224 "1: \n\t"
00225 "movq (%0), %%mm0 \n\t"
00226 "movq (%1), %%mm2 \n\t"
00227 "movq %%mm0, %%mm1 \n\t"
00228 "movq %%mm2, %%mm3 \n\t"
00229 "punpcklbw %%mm7, %%mm0 \n\t"
00230 "punpckhbw %%mm7, %%mm1 \n\t"
00231 "punpcklbw %%mm7, %%mm2 \n\t"
00232 "punpckhbw %%mm7, %%mm3 \n\t"
00233 "psubw %%mm2, %%mm0 \n\t"
00234 "psubw %%mm3, %%mm1 \n\t"
00235 "movq %%mm0, (%2, %%"REG_a")\n\t"
00236 "movq %%mm1, 8(%2, %%"REG_a")\n\t"
00237 "add %3, %0 \n\t"
00238 "add %3, %1 \n\t"
00239 "add $16, %%"REG_a" \n\t"
00240 "jnz 1b \n\t"
00241 : "+r" (s1), "+r" (s2)
00242 : "r" (block+64), "r" ((long)stride)
00243 : "%"REG_a
00244 );
00245 }
00246 #endif //CONFIG_ENCODERS
00247
00248 void put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00249 {
00250 const DCTELEM *p;
00251 uint8_t *pix;
00252
00253
00254 p = block;
00255 pix = pixels;
00256
00257 __asm __volatile(
00258 "movq %3, %%mm0\n\t"
00259 "movq 8%3, %%mm1\n\t"
00260 "movq 16%3, %%mm2\n\t"
00261 "movq 24%3, %%mm3\n\t"
00262 "movq 32%3, %%mm4\n\t"
00263 "movq 40%3, %%mm5\n\t"
00264 "movq 48%3, %%mm6\n\t"
00265 "movq 56%3, %%mm7\n\t"
00266 "packuswb %%mm1, %%mm0\n\t"
00267 "packuswb %%mm3, %%mm2\n\t"
00268 "packuswb %%mm5, %%mm4\n\t"
00269 "packuswb %%mm7, %%mm6\n\t"
00270 "movq %%mm0, (%0)\n\t"
00271 "movq %%mm2, (%0, %1)\n\t"
00272 "movq %%mm4, (%0, %1, 2)\n\t"
00273 "movq %%mm6, (%0, %2)\n\t"
00274 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "m"(*p)
00275 :"memory");
00276 pix += line_size*4;
00277 p += 32;
00278
00279
00280
00281
00282 __asm __volatile(
00283 "movq (%3), %%mm0\n\t"
00284 "movq 8(%3), %%mm1\n\t"
00285 "movq 16(%3), %%mm2\n\t"
00286 "movq 24(%3), %%mm3\n\t"
00287 "movq 32(%3), %%mm4\n\t"
00288 "movq 40(%3), %%mm5\n\t"
00289 "movq 48(%3), %%mm6\n\t"
00290 "movq 56(%3), %%mm7\n\t"
00291 "packuswb %%mm1, %%mm0\n\t"
00292 "packuswb %%mm3, %%mm2\n\t"
00293 "packuswb %%mm5, %%mm4\n\t"
00294 "packuswb %%mm7, %%mm6\n\t"
00295 "movq %%mm0, (%0)\n\t"
00296 "movq %%mm2, (%0, %1)\n\t"
00297 "movq %%mm4, (%0, %1, 2)\n\t"
00298 "movq %%mm6, (%0, %2)\n\t"
00299 ::"r" (pix), "r" ((long)line_size), "r" ((long)line_size*3), "r"(p)
00300 :"memory");
00301 }
00302
00303 static const unsigned char __align8 vector128[8] =
00304 { 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 };
00305
00306 void put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00307 {
00308 int i;
00309
00310 movq_m2r(*vector128, mm1);
00311 for (i = 0; i < 8; i++) {
00312 movq_m2r(*(block), mm0);
00313 packsswb_m2r(*(block + 4), mm0);
00314 block += 8;
00315 paddb_r2r(mm1, mm0);
00316 movq_r2m(mm0, *pixels);
00317 pixels += line_size;
00318 }
00319 }
00320
/**
 * Add an 8x8 block of 16-bit coefficients to 8-bit pixels in place.
 *
 * The loop runs four times and handles two rows per pass: the pixels are
 * zero-extended to words, added to the coefficients with signed saturation
 * (paddsw), packed back to bytes with unsigned saturation (packuswb, i.e.
 * clamped to 0..255) and written over the original pixels.
 *
 * @param block     64 source coefficients, 16 consumed per pass
 * @param pixels    first row of the pixels that are read and overwritten
 * @param line_size distance in bytes between pixel rows
 *
 * mm0-mm7 are overwritten; no emms is executed here.
 */
00321 void add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
00322 {
00323 const DCTELEM *p;
00324 uint8_t *pix;
00325 int i;
00326
00327
00328 p = block;
00329 pix = pixels;
/* mm7 = 0 is set once here and used as the zero source of every unpack in
 * the asm statements below. */
00330 MOVQ_ZERO(mm7);
00331 i = 4;
00332 do {
00333 __asm __volatile(
/* coefficients for two rows */
00334 "movq (%2), %%mm0\n\t"
00335 "movq 8(%2), %%mm1\n\t"
00336 "movq 16(%2), %%mm2\n\t"
00337 "movq 24(%2), %%mm3\n\t"
/* the two pixel rows */
00338 "movq %0, %%mm4\n\t"
00339 "movq %1, %%mm6\n\t"
00340 "movq %%mm4, %%mm5\n\t"
00341 "punpcklbw %%mm7, %%mm4\n\t"
00342 "punpckhbw %%mm7, %%mm5\n\t"
00343 "paddsw %%mm4, %%mm0\n\t"
00344 "paddsw %%mm5, %%mm1\n\t"
00345 "movq %%mm6, %%mm5\n\t"
00346 "punpcklbw %%mm7, %%mm6\n\t"
00347 "punpckhbw %%mm7, %%mm5\n\t"
00348 "paddsw %%mm6, %%mm2\n\t"
00349 "paddsw %%mm5, %%mm3\n\t"
/* clamp to bytes and write both rows back */
00350 "packuswb %%mm1, %%mm0\n\t"
00351 "packuswb %%mm3, %%mm2\n\t"
00352 "movq %%mm0, %0\n\t"
00353 "movq %%mm2, %1\n\t"
00354 :"+m"(*pix), "+m"(*(pix+line_size))
00355 :"r"(p)
00356 :"memory");
00357 pix += line_size*2;
00358 p += 16;
00359 } while (--i);
00360 }
00361
/**
 * Copy h rows of 4 bytes from pixels to block.
 *
 * Source and destination use the same row distance line_size. Four rows are
 * copied per loop pass and the loop only terminates when h, decremented by 4
 * each pass, reaches exactly zero, so h has to be a positive multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size distance in bytes between rows (both buffers)
 * @param h         number of rows
 */
00362 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00363 {
00364 __asm __volatile(
/* REG_a = 2 * line_size, the pointer step for two rows */
00365 "lea (%3, %3), %%"REG_a" \n\t"
00366 ".balign 8 \n\t"
00367 "1: \n\t"
00368 "movd (%1), %%mm0 \n\t"
00369 "movd (%1, %3), %%mm1 \n\t"
00370 "movd %%mm0, (%2) \n\t"
00371 "movd %%mm1, (%2, %3) \n\t"
00372 "add %%"REG_a", %1 \n\t"
00373 "add %%"REG_a", %2 \n\t"
00374 "movd (%1), %%mm0 \n\t"
00375 "movd (%1, %3), %%mm1 \n\t"
00376 "movd %%mm0, (%2) \n\t"
00377 "movd %%mm1, (%2, %3) \n\t"
00378 "add %%"REG_a", %1 \n\t"
00379 "add %%"REG_a", %2 \n\t"
00380 "subl $4, %0 \n\t"
00381 "jnz 1b \n\t"
00382 : "+g"(h), "+r" (pixels), "+r" (block)
00383 : "r"((long)line_size)
00384 : "%"REG_a, "memory"
00385 );
00386 }
00387
/**
 * Copy h rows of 8 bytes from pixels to block.
 *
 * Source and destination use the same row distance line_size. Four rows are
 * copied per loop pass and the loop only terminates when h, decremented by 4
 * each pass, reaches exactly zero, so h has to be a positive multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size distance in bytes between rows (both buffers)
 * @param h         number of rows
 */
00388 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00389 {
00390 __asm __volatile(
/* REG_a = 2 * line_size, the pointer step for two rows */
00391 "lea (%3, %3), %%"REG_a" \n\t"
00392 ".balign 8 \n\t"
00393 "1: \n\t"
00394 "movq (%1), %%mm0 \n\t"
00395 "movq (%1, %3), %%mm1 \n\t"
00396 "movq %%mm0, (%2) \n\t"
00397 "movq %%mm1, (%2, %3) \n\t"
00398 "add %%"REG_a", %1 \n\t"
00399 "add %%"REG_a", %2 \n\t"
00400 "movq (%1), %%mm0 \n\t"
00401 "movq (%1, %3), %%mm1 \n\t"
00402 "movq %%mm0, (%2) \n\t"
00403 "movq %%mm1, (%2, %3) \n\t"
00404 "add %%"REG_a", %1 \n\t"
00405 "add %%"REG_a", %2 \n\t"
00406 "subl $4, %0 \n\t"
00407 "jnz 1b \n\t"
00408 : "+g"(h), "+r" (pixels), "+r" (block)
00409 : "r"((long)line_size)
00410 : "%"REG_a, "memory"
00411 );
00412 }
00413
/**
 * Copy h rows of 16 bytes from pixels to block.
 *
 * Each row is moved as two 8-byte halves. Source and destination use the same
 * row distance line_size. Four rows are copied per loop pass and the loop only
 * terminates when h, decremented by 4 each pass, reaches exactly zero, so h
 * has to be a positive multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size distance in bytes between rows (both buffers)
 * @param h         number of rows
 */
00414 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
00415 {
00416 __asm __volatile(
/* REG_a = 2 * line_size, the pointer step for two rows */
00417 "lea (%3, %3), %%"REG_a" \n\t"
00418 ".balign 8 \n\t"
00419 "1: \n\t"
00420 "movq (%1), %%mm0 \n\t"
00421 "movq 8(%1), %%mm4 \n\t"
00422 "movq (%1, %3), %%mm1 \n\t"
00423 "movq 8(%1, %3), %%mm5 \n\t"
00424 "movq %%mm0, (%2) \n\t"
00425 "movq %%mm4, 8(%2) \n\t"
00426 "movq %%mm1, (%2, %3) \n\t"
00427 "movq %%mm5, 8(%2, %3) \n\t"
00428 "add %%"REG_a", %1 \n\t"
00429 "add %%"REG_a", %2 \n\t"
00430 "movq (%1), %%mm0 \n\t"
00431 "movq 8(%1), %%mm4 \n\t"
00432 "movq (%1, %3), %%mm1 \n\t"
00433 "movq 8(%1, %3), %%mm5 \n\t"
00434 "movq %%mm0, (%2) \n\t"
00435 "movq %%mm4, 8(%2) \n\t"
00436 "movq %%mm1, (%2, %3) \n\t"
00437 "movq %%mm5, 8(%2, %3) \n\t"
00438 "add %%"REG_a", %1 \n\t"
00439 "add %%"REG_a", %2 \n\t"
00440 "subl $4, %0 \n\t"
00441 "jnz 1b \n\t"
00442 : "+g"(h), "+r" (pixels), "+r" (block)
00443 : "r"((long)line_size)
00444 : "%"REG_a, "memory"
00445 );
00446 }
00447
00448 static void clear_blocks_mmx(DCTELEM *blocks)
00449 {
00450 __asm __volatile(
00451 "pxor %%mm7, %%mm7 \n\t"
00452 "mov $-128*6, %%"REG_a" \n\t"
00453 "1: \n\t"
00454 "movq %%mm7, (%0, %%"REG_a") \n\t"
00455 "movq %%mm7, 8(%0, %%"REG_a") \n\t"
00456 "movq %%mm7, 16(%0, %%"REG_a") \n\t"
00457 "movq %%mm7, 24(%0, %%"REG_a") \n\t"
00458 "add $32, %%"REG_a" \n\t"
00459 " js 1b \n\t"
00460 : : "r" (((uint8_t *)blocks)+128*6)
00461 : "%"REG_a
00462 );
00463 }
00464
00465 #ifdef CONFIG_ENCODERS
/**
 * Sum all pixels of a 16x16 block.
 *
 * Each row is read as two 8-byte words, widened to 16-bit lanes and added to
 * a four-lane accumulator; the lanes are folded together at the end and the
 * result is masked to 16 bits (the maximum, 256 * 255, fits).
 *
 * @param pix       address of the first row
 * @param line_size distance in bytes between rows
 * @return sum of the 256 pixel values
 *
 * mm0-mm3 and mm5-mm7 are overwritten; no emms is executed here.
 */
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    /* negative byte offset that counts up to zero; the base passed to the asm
     * is biased by the same amount so base+index starts at pix */
    long index= -line_size*h;

    __asm __volatile(
        "pxor %%mm7, %%mm7              \n\t"
        "pxor %%mm6, %%mm6              \n\t"
        "1:                             \n\t"
        "movq (%2, %1), %%mm0           \n\t"
        "movq (%2, %1), %%mm1           \n\t"
        "movq 8(%2, %1), %%mm2          \n\t"
        "movq 8(%2, %1), %%mm3          \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "paddw %%mm1, %%mm3             \n\t"
        "paddw %%mm3, %%mm6             \n\t"
        "add %3, %1                     \n\t"
        " js 1b                         \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movq %%mm6, %%mm5              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm5, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        "andl $0xFFFF, %0               \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((long)line_size)
        /* the pixels are read through a register operand only; without this
         * clobber the compiler may defer or drop earlier stores to them */
        : "memory"
    );

    return sum;
}
00503 #endif //CONFIG_ENCODERS
00504
/**
 * dst[i] += src[i] for i in [0, w), with wrapping byte arithmetic.
 *
 * Whole 16-byte groups are handled by the MMX loop and the remaining 0..15
 * bytes by the scalar loop. The bound check is done BEFORE the first MMX
 * pass and with a signed comparison, so that
 *   - w < 16 never touches bytes at or beyond index w, and
 *   - w < 15 (where w-15 is negative) does not turn into a huge unsigned
 *     limit that lets the loop run far past the buffers.
 *
 * @param dst bytes that are read and updated
 * @param src bytes that are added
 * @param w   number of bytes; values <= 0 leave dst untouched
 *
 * mm0 and mm1 are overwritten; no emms is executed here.
 */
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    long i=0;
    __asm __volatile(
        "jmp 2f                         \n\t"
        "1:                             \n\t"
        "movq (%1, %0), %%mm0           \n\t"
        "movq (%2, %0), %%mm1           \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, (%2, %0)           \n\t"
        "movq 8(%1, %0), %%mm0          \n\t"
        "movq 8(%2, %0), %%mm1          \n\t"
        "paddb %%mm0, %%mm1             \n\t"
        "movq %%mm1, 8(%2, %0)          \n\t"
        "add $16, %0                    \n\t"
        "2:                             \n\t"
        /* continue while i < w-15, i.e. while 16 more bytes are available */
        "cmp %3, %0                     \n\t"
        " jl 1b                         \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((long)w-15)
        /* dst is written and src read through register operands only */
        : "memory"
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
00526
/* Shared instruction sequence of the two H.263 loop filters below; it is
 * pasted at the start of an asm statement that must supply these operands:
 *   %0 .. %3  four 8-byte memory rows, in order across the filtered edge
 *   %4        twice the filter strength (read with movd)
 *   %5        8-byte mask (both users pass ff_pb_FC)
 *
 * The sequence first forms, in 16-bit lanes, (%0 - %3) + 4 * (%2 - %1), takes
 * its absolute value and shifts it right by 3; the correction derived from
 * that is applied to the two inner rows with saturating byte arithmetic, and
 * a second, smaller correction derived from %0 - %3 is applied to the two
 * outer rows.
 *
 * Nothing is stored: the results are left in registers for the caller,
 *   mm5 = new %0,  mm3 = new %1,  mm4 = new %2,  mm6 = new %3,
 * and every MMX register mm0-mm7 is overwritten. */
00527 #define H263_LOOP_FILTER \
00528 "pxor %%mm7, %%mm7 \n\t"\
00529 "movq %0, %%mm0 \n\t"\
00530 "movq %0, %%mm1 \n\t"\
00531 "movq %3, %%mm2 \n\t"\
00532 "movq %3, %%mm3 \n\t"\
00533 "punpcklbw %%mm7, %%mm0 \n\t"\
00534 "punpckhbw %%mm7, %%mm1 \n\t"\
00535 "punpcklbw %%mm7, %%mm2 \n\t"\
00536 "punpckhbw %%mm7, %%mm3 \n\t"\
00537 "psubw %%mm2, %%mm0 \n\t"\
00538 "psubw %%mm3, %%mm1 \n\t"\
00539 "movq %1, %%mm2 \n\t"\
00540 "movq %1, %%mm3 \n\t"\
00541 "movq %2, %%mm4 \n\t"\
00542 "movq %2, %%mm5 \n\t"\
00543 "punpcklbw %%mm7, %%mm2 \n\t"\
00544 "punpckhbw %%mm7, %%mm3 \n\t"\
00545 "punpcklbw %%mm7, %%mm4 \n\t"\
00546 "punpckhbw %%mm7, %%mm5 \n\t"\
00547 "psubw %%mm2, %%mm4 \n\t"\
00548 "psubw %%mm3, %%mm5 \n\t"\
00549 "psllw $2, %%mm4 \n\t"\
00550 "psllw $2, %%mm5 \n\t"\
00551 "paddw %%mm0, %%mm4 \n\t"\
00552 "paddw %%mm1, %%mm5 \n\t"\
00553 "pxor %%mm6, %%mm6 \n\t"\
00554 "pcmpgtw %%mm4, %%mm6 \n\t"\
00555 "pcmpgtw %%mm5, %%mm7 \n\t"\
00556 "pxor %%mm6, %%mm4 \n\t"\
00557 "pxor %%mm7, %%mm5 \n\t"\
00558 "psubw %%mm6, %%mm4 \n\t"\
00559 "psubw %%mm7, %%mm5 \n\t"\
00560 "psrlw $3, %%mm4 \n\t"\
00561 "psrlw $3, %%mm5 \n\t"\
00562 "packuswb %%mm5, %%mm4 \n\t"\
00563 "packsswb %%mm7, %%mm6 \n\t"\
00564 "pxor %%mm7, %%mm7 \n\t"\
00565 "movd %4, %%mm2 \n\t"\
00566 "punpcklbw %%mm2, %%mm2 \n\t"\
00567 "punpcklbw %%mm2, %%mm2 \n\t"\
00568 "punpcklbw %%mm2, %%mm2 \n\t"\
00569 "psubusb %%mm4, %%mm2 \n\t"\
00570 "movq %%mm2, %%mm3 \n\t"\
00571 "psubusb %%mm4, %%mm3 \n\t"\
00572 "psubb %%mm3, %%mm2 \n\t"\
00573 "movq %1, %%mm3 \n\t"\
00574 "movq %2, %%mm4 \n\t"\
00575 "pxor %%mm6, %%mm3 \n\t"\
00576 "pxor %%mm6, %%mm4 \n\t"\
00577 "paddusb %%mm2, %%mm3 \n\t"\
00578 "psubusb %%mm2, %%mm4 \n\t"\
00579 "pxor %%mm6, %%mm3 \n\t"\
00580 "pxor %%mm6, %%mm4 \n\t"\
00581 "paddusb %%mm2, %%mm2 \n\t"\
00582 "packsswb %%mm1, %%mm0 \n\t"\
00583 "pcmpgtb %%mm0, %%mm7 \n\t"\
00584 "pxor %%mm7, %%mm0 \n\t"\
00585 "psubb %%mm7, %%mm0 \n\t"\
00586 "movq %%mm0, %%mm1 \n\t"\
00587 "psubusb %%mm2, %%mm0 \n\t"\
00588 "psubb %%mm0, %%mm1 \n\t"\
00589 "pand %5, %%mm1 \n\t"\
00590 "psrlw $2, %%mm1 \n\t"\
00591 "pxor %%mm7, %%mm1 \n\t"\
00592 "psubb %%mm7, %%mm1 \n\t"\
00593 "movq %0, %%mm5 \n\t"\
00594 "movq %3, %%mm6 \n\t"\
00595 "psubb %%mm1, %%mm5 \n\t"\
00596 "paddb %%mm1, %%mm6 \n\t"
00597
/**
 * Apply the H.263 loop filter across a horizontal edge, 8 pixels wide.
 *
 * The four rows src-2*stride, src-stride, src and src+stride are filtered in
 * place: H263_LOOP_FILTER leaves the new rows in mm5/mm3/mm4/mm6 and the four
 * stores below write them back.
 *
 * @param src    first pixel of the row just below the edge
 * @param stride distance in bytes between rows
 * @param qscale index into ff_h263_loop_filter_strength (32 entries; the
 *               value is not range-checked here)
 */
00598 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00599 const int strength= ff_h263_loop_filter_strength[qscale];
00600
00601 asm volatile(
00602
00603 H263_LOOP_FILTER
00604
/* write the filtered rows back (operand order: %0 = src-2*stride ... %3 = src+stride) */
00605 "movq %%mm3, %1 \n\t"
00606 "movq %%mm4, %2 \n\t"
00607 "movq %%mm5, %0 \n\t"
00608 "movq %%mm6, %3 \n\t"
00609 : "+m" (*(uint64_t*)(src - 2*stride)),
00610 "+m" (*(uint64_t*)(src - 1*stride)),
00611 "+m" (*(uint64_t*)(src + 0*stride)),
00612 "+m" (*(uint64_t*)(src + 1*stride))
00613 : "g" (2*strength), "m"(ff_pb_FC)
00614 );
00615 }
00616
/**
 * Transpose a 4x4 block of bytes.
 *
 * Four 4-byte rows are read from src (rows src_stride apart), interleaved
 * first bytewise and then wordwise, and the four resulting 4-byte rows are
 * written to dst (rows dst_stride apart), so that dst row i holds byte i of
 * each source row.
 *
 * @param dst        destination of the transposed block
 * @param src        source block
 * @param dst_stride distance in bytes between destination rows
 * @param src_stride distance in bytes between source rows
 *
 * mm0-mm3 are overwritten; no emms is executed here.
 */
00617 static inline void transpose4x4(uint8_t *dst, uint8_t *src, int dst_stride, int src_stride){
00618 asm volatile(
00619 "movd %4, %%mm0 \n\t"
00620 "movd %5, %%mm1 \n\t"
00621 "movd %6, %%mm2 \n\t"
00622 "movd %7, %%mm3 \n\t"
00623 "punpcklbw %%mm1, %%mm0 \n\t"
00624 "punpcklbw %%mm3, %%mm2 \n\t"
00625 "movq %%mm0, %%mm1 \n\t"
00626 "punpcklwd %%mm2, %%mm0 \n\t"
00627 "punpckhwd %%mm2, %%mm1 \n\t"
/* mm0 holds output rows 0 and 1, mm1 holds output rows 2 and 3 */
00628 "movd %%mm0, %0 \n\t"
00629 "punpckhdq %%mm0, %%mm0 \n\t"
00630 "movd %%mm0, %1 \n\t"
00631 "movd %%mm1, %2 \n\t"
00632 "punpckhdq %%mm1, %%mm1 \n\t"
00633 "movd %%mm1, %3 \n\t"
00634
00635 : "=m" (*(uint32_t*)(dst + 0*dst_stride)),
00636 "=m" (*(uint32_t*)(dst + 1*dst_stride)),
00637 "=m" (*(uint32_t*)(dst + 2*dst_stride)),
00638 "=m" (*(uint32_t*)(dst + 3*dst_stride))
00639 : "m" (*(uint32_t*)(src + 0*src_stride)),
00640 "m" (*(uint32_t*)(src + 1*src_stride)),
00641 "m" (*(uint32_t*)(src + 2*src_stride)),
00642 "m" (*(uint32_t*)(src + 3*src_stride))
00643 );
00644 }
00645
00646 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
00647 const int strength= ff_h263_loop_filter_strength[qscale];
00648 uint64_t temp[4] __attribute__ ((aligned(8)));
00649 uint8_t *btemp= (uint8_t*)temp;
00650
00651 src -= 2;
00652
00653 transpose4x4(btemp , src , 8, stride);
00654 transpose4x4(btemp+4, src + 4*stride, 8, stride);
00655 asm volatile(
00656 H263_LOOP_FILTER
00657
00658 : "+m" (temp[0]),
00659 "+m" (temp[1]),
00660 "+m" (temp[2]),
00661 "+m" (temp[3])
00662 : "g" (2*strength), "m"(ff_pb_FC)
00663 );
00664
00665 asm volatile(
00666 "movq %%mm5, %%mm1 \n\t"
00667 "movq %%mm4, %%mm0 \n\t"
00668 "punpcklbw %%mm3, %%mm5 \n\t"
00669 "punpcklbw %%mm6, %%mm4 \n\t"
00670 "punpckhbw %%mm3, %%mm1 \n\t"
00671 "punpckhbw %%mm6, %%mm0 \n\t"
00672 "movq %%mm5, %%mm3 \n\t"
00673 "movq %%mm1, %%mm6 \n\t"
00674 "punpcklwd %%mm4, %%mm5 \n\t"
00675 "punpcklwd %%mm0, %%mm1 \n\t"
00676 "punpckhwd %%mm4, %%mm3 \n\t"
00677 "punpckhwd %%mm0, %%mm6 \n\t"
00678 "movd %%mm5, (%0) \n\t"
00679 "punpckhdq %%mm5, %%mm5 \n\t"
00680 "movd %%mm5, (%0,%2) \n\t"
00681 "movd %%mm3, (%0,%2,2) \n\t"
00682 "punpckhdq %%mm3, %%mm3 \n\t"
00683 "movd %%mm3, (%0,%3) \n\t"
00684 "movd %%mm1, (%1) \n\t"
00685 "punpckhdq %%mm1, %%mm1 \n\t"
00686 "movd %%mm1, (%1,%2) \n\t"
00687 "movd %%mm6, (%1,%2,2) \n\t"
00688 "punpckhdq %%mm6, %%mm6 \n\t"
00689 "movd %%mm6, (%1,%3) \n\t"
00690 :: "r" (src),
00691 "r" (src + 4*stride),
00692 "r" ((long) stride ),
00693 "r" ((long)(3*stride))
00694 );
00695 }
00696
00697 #ifdef CONFIG_ENCODERS
/**
 * Sum of the squared pixel values of a 16x16 block.
 *
 * Each of the 16 rows is read as two 8-byte words; the bytes are widened to
 * words, squared and pair-summed with pmaddwd, and accumulated as two 32-bit
 * lanes that are added together at the end.
 *
 * @param pix       address of the first row
 * @param line_size distance in bytes between rows
 * @return sum over the 256 pixels of value*value
 *
 * mm0-mm4 and mm7 are overwritten; no emms is executed here.
 */
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm __volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"
        "movq 8(%0),%%mm3\n"

        "movq %%mm2,%%mm1\n"

        "punpckhbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm2\n"

        "movq %%mm3,%%mm4\n"
        "punpckhbw %%mm0,%%mm3\n"
        "punpcklbw %%mm0,%%mm4\n"

        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm2,%%mm2\n"

        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"

        "paddd %%mm1,%%mm2\n"

        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"

        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"

        /* fold the two 32-bit lanes */
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((long)line_size)
        /* "memory": the pixels are read through a register operand only */
        : "%ecx", "memory" );
    return tmp;
}
00740
/**
 * Sum of squared differences between two blocks 8 pixels wide and h rows high.
 *
 * Per row the absolute byte difference is built from two saturating
 * subtractions or-ed together, widened to words, squared and pair-summed with
 * pmaddwd and accumulated as two 32-bit lanes.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows (both blocks)
 * @param h         number of rows; the loop counts down to exactly zero, so
 *                  it must be positive
 * @return sum of (pix1 - pix2)^2 over the block
 *
 * mm0-mm2, mm5 and mm7 are overwritten; no emms is executed here.
 */
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm __volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"

        "movq %%mm1,%%mm5\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm5,%%mm2\n"

        "por %%mm1,%%mm2\n"

        "movq %%mm2,%%mm1\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpcklbw %%mm0,%%mm1\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm1,%%mm1\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm1,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        /* fold the two 32-bit lanes */
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": both blocks are read through register operands only */
        : "%ecx", "memory");
    return tmp;
}
00783
/**
 * Sum of squared differences between two blocks 16 pixels wide and h rows
 * high.
 *
 * Same scheme as the 8-wide version, applied to two 8-byte words per row:
 * absolute byte differences from two saturating subtractions or-ed together,
 * widened to words, squared and pair-summed with pmaddwd, accumulated as two
 * 32-bit lanes.
 *
 * @param v         unused
 * @param pix1      first block
 * @param pix2      second block
 * @param line_size distance in bytes between rows (both blocks)
 * @param h         number of rows; the loop counts down to exactly zero, so
 *                  it must be positive
 * @return sum of (pix1 - pix2)^2 over the block
 *
 * mm0-mm7 are overwritten; no emms is executed here.
 */
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm __volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm1\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm3\n"
        "movq 8(%1),%%mm4\n"

        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"

        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"

        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"

        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n"
        "punpcklbw %%mm0,%%mm3\n"

        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"

        "add %3,%0\n"
        "add %3,%1\n"

        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"

        "decl %%ecx\n"
        "jnz 1b\n"

        /* fold the two 32-bit lanes */
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": both blocks are read through register operands only */
        : "%ecx", "memory");
    return tmp;
}
00843
/**
 * High-frequency measure of a block 8 pixels wide and h rows high.
 *
 * For every row the seven horizontal differences d[x] = p[x] - p[x+1]
 * (x = 0..6) are formed in 16-bit lanes; the result is the sum over all
 * pairs of consecutive rows of |d_row[x] - d_nextrow[x]|.
 *
 * The first two rows are handled before the loop; the loop then consumes two
 * rows per pass with a counter initialised to h-2 that must reach exactly
 * zero, so h has to be even and at least 4.
 *
 * @param pix1      address of the first row (8 bytes are read per row)
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 * @return the accumulated absolute second differences
 *
 * mm0-mm7 are overwritten; no emms is executed here.
 */
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm __volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        /* row 0: mm0/mm2 = horizontal differences */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        /* row 1: mm4/mm5 = horizontal differences, then |row0 - row1| */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        /* even row of the pair: compared against mm4/mm5 */
        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        /* odd row of the pair: compared against mm0/mm2 */
        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        /* widen the four word sums to dwords and fold them */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        /* "memory": the rows are read through a register operand only */
        : "%ecx", "memory");
    return tmp;
}
00968
/**
 * High-frequency measure of a block 16 pixels wide and h rows high.
 *
 * The asm part covers the horizontal differences p[x] - p[x+1] for
 * x = 0..7 (it loads each row at offset 0 and at offset 1) and accumulates
 * |d_row[x] - d_nextrow[x]| over all pairs of consecutive rows; the remaining
 * columns are added by calling hf_noise8_mmx() on pix1 + 8.
 *
 * The first two rows are handled before the loop; the loop then consumes two
 * rows per pass with a counter initialised to h-2 that must reach exactly
 * zero, so h has to be even and at least 4.
 *
 * @param pix1      address of the first row
 * @param line_size distance in bytes between rows
 * @param h         number of rows
 * @return the accumulated absolute second differences
 *
 * mm0-mm7 are overwritten; no emms is executed here.
 */
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    /* pix1 is advanced by the asm; keep the start for the right half */
    uint8_t * pix= pix1;
    __asm __volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        /* row 0: mm0/mm2 = horizontal differences */
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        /* row 1: mm4/mm5 = horizontal differences, then |row0 - row1| */
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        /* even row of the pair: compared against mm4/mm5 */
        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        /* odd row of the pair: compared against mm0/mm2 */
        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        /* widen the four word sums to dwords and fold them */
        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((long)line_size) , "g" (h-2)
        /* "memory": the rows are read through a register operand only */
        : "%ecx", "memory");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
01082
01083 static int nsse16_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01084 int score1= sse16_mmx(c, pix1, pix2, line_size, h);
01085 int score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
01086
01087 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
01088 else return score1 + ABS(score2)*8;
01089 }
01090
01091 static int nsse8_mmx(MpegEncContext *c, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01092 int score1= sse8_mmx(c, pix1, pix2, line_size, h);
01093 int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
01094
01095 if(c) return score1 + ABS(score2)*c->avctx->nsse_weight;
01096 else return score1 + ABS(score2)*8;
01097 }
01098
/**
 * Vertical sum of absolute differences inside one 16-pixel-wide block:
 * the sum over all pairs of consecutive rows of |row[y+1][x] - row[y][x]|.
 *
 * Plain-MMX version: the absolute byte difference is built from two
 * saturating subtractions or-ed together, widened to words and accumulated in
 * four 16-bit lanes that are folded at the end; the result is masked to 16
 * bits.
 *
 * One row pair is handled before the loop; the loop then consumes two rows
 * per pass with a counter initialised to h-2 that must reach exactly zero, so
 * h has to be even and at least 4.
 *
 * @param v         unused
 * @param pix       first row; asserted to be 8-byte aligned
 * @param dummy     unused
 * @param line_size distance in bytes between rows; asserted multiple of 8
 * @param h         number of rows
 *
 * mm0-mm7 are overwritten; no emms is executed here.
 */
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    /* cast through long (pointer-sized in this file's asm operands) rather
     * than int, which would truncate the pointer on LP64 targets */
    assert( (((long)pix) & 7) == 0);
    assert((line_size &7) ==0);

/* in0/in1 hold the previous row on entry; out0/out1 receive the current row
 * so that the next invocation can use them as its in0/in1. */
#define SUM(in0, in1, out0, out1) \
      "movq (%0), %%mm2\n"\
      "movq 8(%0), %%mm3\n"\
      "add %2,%0\n"\
      "movq %%mm2, " #out0 "\n"\
      "movq %%mm3, " #out1 "\n"\
      "psubusb " #in0 ", %%mm2\n"\
      "psubusb " #in1 ", %%mm3\n"\
      "psubusb " #out0 ", " #in0 "\n"\
      "psubusb " #out1 ", " #in1 "\n"\
      "por %%mm2, " #in0 "\n"\
      "por %%mm3, " #in1 "\n"\
      "movq " #in0 ", %%mm2\n"\
      "movq " #in1 ", %%mm3\n"\
      "punpcklbw %%mm7, " #in0 "\n"\
      "punpcklbw %%mm7, " #in1 "\n"\
      "punpckhbw %%mm7, %%mm2\n"\
      "punpckhbw %%mm7, %%mm3\n"\
      "paddw " #in1 ", " #in0 "\n"\
      "paddw %%mm3, %%mm2\n"\
      "paddw %%mm2, " #in0 "\n"\
      "paddw " #in0 ", %%mm6\n"


    __asm __volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "subl $2, %%ecx\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)

        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        /* fold the four 16-bit lanes into lane 0 */
        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((long)line_size) , "m" (h)
        /* "memory": the rows are read through a register operand only */
        : "%ecx", "memory");
    return tmp & 0xFFFF;
}
#undef SUM
01160
/**
 * Vertical SAD inside a single 16-pixel-wide block (MMX2, uses psadbw).
 *
 * Accumulates the sum of absolute differences between each row and the
 * following one over 16 columns.
 *
 * @param v          unused
 * @param pix        block start; asserted to be 8-byte aligned
 * @param dummy      unused
 * @param line_size  row stride in bytes; asserted to be a multiple of 8
 * @param h          number of rows; the loop counter only reaches zero when
 *                   h is even and greater than 2
 * @return the low 32 bits of the accumulator mm6
 */
01161 static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
01162 int tmp;
01163
01164 assert( (((int)pix) & 7) == 0);
01165 assert((line_size &7) ==0);
01166
/* SUM(in0,in1,out0,out1): load the next row into out0/out1, advance %0 by
   the stride, replace in0/in1 (previous row) by their psadbw against the new
   row and add both partial sums to mm6. */
01167 #define SUM(in0, in1, out0, out1) \
01168 "movq (%0), " #out0 "\n"\
01169 "movq 8(%0), " #out1 "\n"\
01170 "add %2,%0\n"\
01171 "psadbw " #out0 ", " #in0 "\n"\
01172 "psadbw " #out1 ", " #in1 "\n"\
01173 "paddw " #in1 ", " #in0 "\n"\
01174 "paddw " #in0 ", %%mm6\n"
01175
01176 asm volatile (
/* ecx = row counter, mm6 = accumulator; mm7 is cleared but not referenced
   by this function's SUM macro */
01177 "movl %3,%%ecx\n"
01178 "pxor %%mm6,%%mm6\n"
01179 "pxor %%mm7,%%mm7\n"
/* prime mm0/mm1 with the first row */
01180 "movq (%0),%%mm0\n"
01181 "movq 8(%0),%%mm1\n"
01182 "add %2,%0\n"
01183 "subl $2, %%ecx\n"
01184 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01185 "1:\n"
01186
/* two row pairs per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01187 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01188
01189 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01190
01191 "subl $2, %%ecx\n"
01192 "jnz 1b\n"
01193
01194 "movd %%mm6,%1\n"
01195 : "+r" (pix), "=r"(tmp)
01196 : "r" ((long)line_size) , "m" (h)
01197 : "%ecx");
01198 return tmp;
01199 }
01200 #undef SUM
01201
/**
 * Vertical SAD of the difference signal pix1 - pix2 for a 16-pixel-wide
 * block (plain MMX).
 *
 * For every row the byte-wise difference pix1 - pix2 (wrapping) is formed
 * and XOR-ed with 0x80; the absolute differences between those biased values
 * of consecutive rows are accumulated.
 *
 * @param v          unused
 * @param pix1       first block; asserted to be 8-byte aligned
 * @param pix2       second block; asserted to be 8-byte aligned
 * @param line_size  row stride in bytes; asserted to be a multiple of 8
 * @param h          number of rows; the loop counter only reaches zero when
 *                   h is even and greater than 2
 * @return accumulated sum masked with 0x7FFF
 */
01202 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01203 int tmp;
01204
01205 assert( (((int)pix1) & 7) == 0);
01206 assert( (((int)pix2) & 7) == 0);
01207 assert((line_size &7) ==0);
01208
/* SUM(in0,in1,out0,out1): compute the biased difference row
   (pix1 - pix2) ^ mm7 for the next line into out0/out1, advance both
   pointers, then accumulate the per-byte absolute difference against the
   previous biased row in in0/in1 into mm6.
   mm7 holds 0x80 in every byte here, so the punpck*bw steps put 0x80 into the
   high byte of each word; four such words are added per invocation, so that
   bias cancels modulo 2^16. */
01209 #define SUM(in0, in1, out0, out1) \
01210 "movq (%0),%%mm2\n"\
01211 "movq (%1)," #out0 "\n"\
01212 "movq 8(%0),%%mm3\n"\
01213 "movq 8(%1)," #out1 "\n"\
01214 "add %3,%0\n"\
01215 "add %3,%1\n"\
01216 "psubb " #out0 ", %%mm2\n"\
01217 "psubb " #out1 ", %%mm3\n"\
01218 "pxor %%mm7, %%mm2\n"\
01219 "pxor %%mm7, %%mm3\n"\
01220 "movq %%mm2, " #out0 "\n"\
01221 "movq %%mm3, " #out1 "\n"\
01222 "psubusb " #in0 ", %%mm2\n"\
01223 "psubusb " #in1 ", %%mm3\n"\
01224 "psubusb " #out0 ", " #in0 "\n"\
01225 "psubusb " #out1 ", " #in1 "\n"\
01226 "por %%mm2, " #in0 "\n"\
01227 "por %%mm3, " #in1 "\n"\
01228 "movq " #in0 ", %%mm2\n"\
01229 "movq " #in1 ", %%mm3\n"\
01230 "punpcklbw %%mm7, " #in0 "\n"\
01231 "punpcklbw %%mm7, " #in1 "\n"\
01232 "punpckhbw %%mm7, %%mm2\n"\
01233 "punpckhbw %%mm7, %%mm3\n"\
01234 "paddw " #in1 ", " #in0 "\n"\
01235 "paddw %%mm3, %%mm2\n"\
01236 "paddw %%mm2, " #in0 "\n"\
01237 "paddw " #in0 ", %%mm6\n"
01238
01239
01240 asm volatile (
01241 "movl %4,%%ecx\n"
01242 "pxor %%mm6,%%mm6\n"
/* build 0x80 in every byte of mm7: all ones -> 0x8000 per word ->
   signed-saturating pack to bytes */
01243 "pcmpeqw %%mm7,%%mm7\n"
01244 "psllw $15, %%mm7\n"
01245 "packsswb %%mm7, %%mm7\n"
/* first row: biased difference into mm0/mm1 */
01246 "movq (%0),%%mm0\n"
01247 "movq (%1),%%mm2\n"
01248 "movq 8(%0),%%mm1\n"
01249 "movq 8(%1),%%mm3\n"
01250 "add %3,%0\n"
01251 "add %3,%1\n"
01252 "subl $2, %%ecx\n"
01253 "psubb %%mm2, %%mm0\n"
01254 "psubb %%mm3, %%mm1\n"
01255 "pxor %%mm7, %%mm0\n"
01256 "pxor %%mm7, %%mm1\n"
01257 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01258 "1:\n"
01259
/* two row pairs per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01260 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01261
01262 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01263
01264 "subl $2, %%ecx\n"
01265 "jnz 1b\n"
01266
/* horizontal add of the four word lanes of mm6 into the low word */
01267 "movq %%mm6,%%mm0\n"
01268 "psrlq $32, %%mm6\n"
01269 "paddw %%mm6,%%mm0\n"
01270 "movq %%mm0,%%mm6\n"
01271 "psrlq $16, %%mm0\n"
01272 "paddw %%mm6,%%mm0\n"
01273 "movd %%mm0,%2\n"
01274 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
01275 : "r" ((long)line_size) , "m" (h)
01276 : "%ecx");
01277 return tmp & 0x7FFF;
01278 }
01279 #undef SUM
01280
/**
 * Vertical SAD of the difference signal pix1 - pix2 for a 16-pixel-wide
 * block (MMX2, uses psadbw).
 *
 * For every row the byte-wise difference pix1 - pix2 (wrapping) is formed
 * and XOR-ed with 0x80; psadbw between those biased values of consecutive
 * rows is accumulated in mm6.
 *
 * @param v          unused
 * @param pix1       first block; asserted to be 8-byte aligned
 * @param pix2       second block; asserted to be 8-byte aligned
 * @param line_size  row stride in bytes; asserted to be a multiple of 8
 * @param h          number of rows; the loop counter only reaches zero when
 *                   h is even and greater than 2
 * @return the low 32 bits of the accumulator mm6
 */
01281 static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
01282 int tmp;
01283
01284 assert( (((int)pix1) & 7) == 0);
01285 assert( (((int)pix2) & 7) == 0);
01286 assert((line_size &7) ==0);
01287
/* SUM(in0,in1,out0,out1): compute the biased difference row
   (pix1 - pix2) ^ mm7 for the next line into out0/out1, advance both
   pointers, then add psadbw(previous row, new row) to mm6. */
01288 #define SUM(in0, in1, out0, out1) \
01289 "movq (%0)," #out0 "\n"\
01290 "movq (%1),%%mm2\n"\
01291 "movq 8(%0)," #out1 "\n"\
01292 "movq 8(%1),%%mm3\n"\
01293 "add %3,%0\n"\
01294 "add %3,%1\n"\
01295 "psubb %%mm2, " #out0 "\n"\
01296 "psubb %%mm3, " #out1 "\n"\
01297 "pxor %%mm7, " #out0 "\n"\
01298 "pxor %%mm7, " #out1 "\n"\
01299 "psadbw " #out0 ", " #in0 "\n"\
01300 "psadbw " #out1 ", " #in1 "\n"\
01301 "paddw " #in1 ", " #in0 "\n"\
01302 "paddw " #in0 ", %%mm6\n"
01303
01304 asm volatile (
01305 "movl %4,%%ecx\n"
01306 "pxor %%mm6,%%mm6\n"
/* build 0x80 in every byte of mm7: all ones -> 0x8000 per word ->
   signed-saturating pack to bytes */
01307 "pcmpeqw %%mm7,%%mm7\n"
01308 "psllw $15, %%mm7\n"
01309 "packsswb %%mm7, %%mm7\n"
/* first row: biased difference into mm0/mm1 */
01310 "movq (%0),%%mm0\n"
01311 "movq (%1),%%mm2\n"
01312 "movq 8(%0),%%mm1\n"
01313 "movq 8(%1),%%mm3\n"
01314 "add %3,%0\n"
01315 "add %3,%1\n"
01316 "subl $2, %%ecx\n"
01317 "psubb %%mm2, %%mm0\n"
01318 "psubb %%mm3, %%mm1\n"
01319 "pxor %%mm7, %%mm0\n"
01320 "pxor %%mm7, %%mm1\n"
01321 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01322 "1:\n"
01323
/* two row pairs per iteration, ping-ponging between mm0/mm1 and mm4/mm5 */
01324 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
01325
01326 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
01327
01328 "subl $2, %%ecx\n"
01329 "jnz 1b\n"
01330
01331 "movd %%mm6,%2\n"
01332 : "+r" (pix1), "+r" (pix2), "=r"(tmp)
01333 : "r" ((long)line_size) , "m" (h)
01334 : "%ecx");
01335 return tmp;
01336 }
01337 #undef SUM
01338
/**
 * Byte-wise difference of two buffers: dst[i] = src1[i] - src2[i]
 * (modulo 256) for every i in [0, w).
 *
 * Full 16-byte chunks are processed with MMX; the remaining tail (and the
 * whole buffer when w < 16) is handled by the scalar loop.
 *
 * @param dst   destination, at least w bytes
 * @param src1  minuend, at least w bytes
 * @param src2  subtrahend, at least w bytes
 * @param w     number of bytes to process; values <= 0 do nothing
 *
 * The MMX state is left dirty (no emms is issued here).
 */
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i=0;

    /* The asm loop is do-while shaped and consumes 16 bytes per pass, and its
       exit test is an unsigned compare against w-15.  Entering it with w < 16
       would touch bytes past the buffers, and with w < 15 the negative limit
       would look like a huge unsigned value and the loop would run away. */
    if(w>=16){
        __asm__ volatile(
            "1:                             \n\t"
            "movq  (%2, %0), %%mm0          \n\t"
            "movq  (%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, (%3, %0)           \n\t"
            "movq 8(%2, %0), %%mm0          \n\t"
            "movq 8(%1, %0), %%mm1          \n\t"
            "psubb %%mm0, %%mm1             \n\t"
            "movq %%mm1, 8(%3, %0)          \n\t"
            "add $16, %0                    \n\t"
            "cmp %4, %0                     \n\t"
            " jb 1b                         \n\t"
            : "+r" (i)
            : "r"(src1), "r"(src2), "r"(dst), "r"((long)w-15)
            : "memory" /* dst is written behind the compiler's back */
        );
    }

    /* scalar tail */
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
01360
/**
 * Subtract a median prediction from src2 and store the residual in dst
 * (MMX2 version).
 *
 * For each position i the predictor is the median of
 *   src1[i], src2[i-1] and src1[i] + src2[i-1] - src1[i-1]   (byte wrapping)
 * and dst[i] = src2[i] - predictor.  The vector loop reads index -1 of both
 * sources for its first element; dst[0] is then recomputed in C from
 * *left / *left_top instead.
 *
 * @param dst       output residuals
 * @param src1      first source row (read at i-1 and i)
 * @param src2      second source row (read at i-1 and i)
 * @param w         number of bytes; the vector loop works in steps of 8 and
 *                  always runs at least once, so up to the next multiple of 8
 *                  bytes are read and written
 * @param left      in: value used in place of src2[-1] for dst[0];
 *                  out: src2[w-1]
 * @param left_top  in: value used in place of src1[-1] for dst[0];
 *                  out: src1[w-1]
 *
 * NOTE(review): the asm stores to dst but declares no "memory" clobber, and
 * the unsigned loop test would not terminate normally for w <= 0 -- confirm
 * callers never pass that.
 */
01361 static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
01362 long i=0;
01363 uint8_t l, lt;
01364
01365 asm volatile(
01366 "1: \n\t"
/* mm0 = src1[i-1..], mm1 = src1[i..], mm2 = src2[i-1..], mm3 = src2[i..] */
01367 "movq -1(%1, %0), %%mm0 \n\t"
01368 "movq (%1, %0), %%mm1 \n\t"
01369 "movq -1(%2, %0), %%mm2 \n\t"
01370 "movq (%2, %0), %%mm3 \n\t"
/* mm2 = src2[i-1] - src1[i-1] + src1[i]; mm4/mm5 keep src2[i-1] */
01371 "movq %%mm2, %%mm4 \n\t"
01372 "psubb %%mm0, %%mm2 \n\t"
01373 "paddb %%mm1, %%mm2 \n\t"
01374 "movq %%mm4, %%mm5 \n\t"
/* median of three via max(min(a,b), min(max(a,b), c)) */
01375 "pmaxub %%mm1, %%mm4 \n\t"
01376 "pminub %%mm5, %%mm1 \n\t"
01377 "pminub %%mm2, %%mm4 \n\t"
01378 "pmaxub %%mm1, %%mm4 \n\t"
/* residual = src2[i] - median */
01379 "psubb %%mm4, %%mm3 \n\t"
01380 "movq %%mm3, (%3, %0) \n\t"
01381 "add $8, %0 \n\t"
01382 "cmp %4, %0 \n\t"
01383 " jb 1b \n\t"
01384 : "+r" (i)
01385 : "r"(src1), "r"(src2), "r"(dst), "r"((long)w)
01386 );
01387
01388 l= *left;
01389 lt= *left_top;
01390
/* redo the first element with the caller-supplied left / top-left values */
01391 dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
01392
/* hand the last column back to the caller */
01393 *left_top= src1[w-1];
01394 *left = src2[w-1];
01395 }
01396
/* Two word-wise butterflies in parallel: (a, b) -> (a + b, b - a) for the
   register pairs (a1,b1) and (a2,b2).  The difference is obtained as
   2*b - (a + b), so no temporary register is needed. */
01397 #define LBUTTERFLY2(a1,b1,a2,b2)\
01398 "paddw " #b1 ", " #a1 " \n\t"\
01399 "paddw " #b2 ", " #a2 " \n\t"\
01400 "paddw " #b1 ", " #b1 " \n\t"\
01401 "paddw " #b2 ", " #b2 " \n\t"\
01402 "psubw " #a1 ", " #b1 " \n\t"\
01403 "psubw " #a2 ", " #b2 " \n\t"
01404
/* Three butterfly stages across mm0..mm7, pairing registers whose indices
   differ by 1, then by 2, then by 4.  Each register holds four words, so four
   independent 8-point transforms are performed at once. */
01405 #define HADAMARD48\
01406 LBUTTERFLY2(%%mm0, %%mm1, %%mm2, %%mm3)\
01407 LBUTTERFLY2(%%mm4, %%mm5, %%mm6, %%mm7)\
01408 LBUTTERFLY2(%%mm0, %%mm2, %%mm1, %%mm3)\
01409 LBUTTERFLY2(%%mm4, %%mm6, %%mm5, %%mm7)\
01410 LBUTTERFLY2(%%mm0, %%mm4, %%mm1, %%mm5)\
01411 LBUTTERFLY2(%%mm2, %%mm6, %%mm3, %%mm7)\
01412
/* Word-wise absolute value of a (plain MMX): z becomes the sign mask of a
   (all ones where a < 0), then a = (a ^ z) - z.  z is clobbered. */
01413 #define MMABS(a,z)\
01414 "pxor " #z ", " #z " \n\t"\
01415 "pcmpgtw " #a ", " #z " \n\t"\
01416 "pxor " #z ", " #a " \n\t"\
01417 "psubw " #z ", " #a " \n\t"
01418
/* Same as MMABS, followed by an unsigned-saturating add of the result into
   sum.  Both a and z are clobbered. */
01419 #define MMABS_SUM(a,z, sum)\
01420 "pxor " #z ", " #z " \n\t"\
01421 "pcmpgtw " #a ", " #z " \n\t"\
01422 "pxor " #z ", " #a " \n\t"\
01423 "psubw " #z ", " #a " \n\t"\
01424 "paddusw " #a ", " #sum " \n\t"
01425
/* Word-wise absolute value of a using MMX2 pmaxsw: z = 0 - a, then
   a = max(a, z) (signed).  z is clobbered. */
01426 #define MMABS_MMX2(a,z)\
01427 "pxor " #z ", " #z " \n\t"\
01428 "psubw " #a ", " #z " \n\t"\
01429 "pmaxsw " #z ", " #a " \n\t"
01430
/* Same as MMABS_MMX2, followed by an unsigned-saturating add of the result
   into sum.  Both a and z are clobbered. */
01431 #define MMABS_SUM_MMX2(a,z, sum)\
01432 "pxor " #z ", " #z " \n\t"\
01433 "psubw " #a ", " #z " \n\t"\
01434 "pmaxsw " #z ", " #a " \n\t"\
01435 "paddusw " #a ", " #sum " \n\t"
01436
/* Interleave a and b at granularity n ("wd" or "dq" as used below):
   a receives the low-half interleave (punpckl), t the high-half interleave
   (punpckh).  b is left unchanged. */
01437 #define SBUTTERFLY(a,b,t,n)\
01438 "movq " #a ", " #t " \n\t" \
01439 "punpckl" #n " " #b ", " #a " \n\t" \
01440 "punpckh" #n " " #b ", " #t " \n\t" \
01441
/* Transpose a 4x4 tile of words held in a, b, c, d using t as scratch.
   The transposed rows end up in a, d, t, c (in that order), which is why the
   STORE4 calls after it list the registers in that permuted order. */
01442 #define TRANSPOSE4(a,b,c,d,t)\
01443 SBUTTERFLY(a,b,t,wd) \
01444 SBUTTERFLY(c,d,b,wd) \
01445 SBUTTERFLY(a,c,d,dq) \
01446 SBUTTERFLY(t,b,c,dq)
01447
/* Load four quadwords from asm operand %1 at byte offsets o, o+16, o+32 and
   o+48 into a, b, c, d. */
01448 #define LOAD4(o, a, b, c, d)\
01449 "movq "#o"(%1), " #a " \n\t"\
01450 "movq "#o"+16(%1), " #b " \n\t"\
01451 "movq "#o"+32(%1), " #c " \n\t"\
01452 "movq "#o"+48(%1), " #d " \n\t"
01453
/* Store a, b, c, d to asm operand %1 at byte offsets o, o+16, o+32 and
   o+48 (the counterpart of LOAD4). */
01454 #define STORE4(o, a, b, c, d)\
01455 "movq "#a", "#o"(%1) \n\t"\
01456 "movq "#b", "#o"+16(%1) \n\t"\
01457 "movq "#c", "#o"+32(%1) \n\t"\
01458 "movq "#d", "#o"+48(%1) \n\t"\
01459
/**
 * Sum of absolute Hadamard-transformed values of an 8x8 block (plain MMX).
 *
 * temp (128 bytes) is filled by diff_pixels_mmx() -- presumably with the
 * 8x8 16-bit differences src1 - src2; verify against that helper.  The asm
 * then runs HADAMARD48 over the data, transposes it in 4x4 word tiles, runs
 * HADAMARD48 again and accumulates the absolute values with saturating adds.
 *
 * @param s       unused
 * @param src1    first block, passed to diff_pixels_mmx()
 * @param src2    second block, passed to diff_pixels_mmx()
 * @param stride  passed to diff_pixels_mmx()
 * @param h       asserted to be 8
 * @return low 16 bits of the (saturating) sum
 *
 * NOTE(review): temp is written by the asm through an input-only "r"
 * operand and no "memory" clobber is declared.
 */
01460 static int hadamard8_diff_mmx(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
01461 uint64_t temp[16] __align8;
01462 int sum=0;
01463
01464 assert(h==8);
01465
01466 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
01467
01468 asm volatile(
/* pass 1a: quadwords at offsets 0,16,...,112 (first 8 bytes of every
   16-byte line of temp) */
01469 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
01470 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
01471
01472 HADAMARD48
01473
/* spill mm7: TRANSPOSE4 needs a scratch register; offset 112 was already
   loaded above */
01474 "movq %%mm7, 112(%1) \n\t"
01475
01476 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
01477 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
01478
01479 "movq 112(%1), %%mm7 \n\t"
01480 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
01481 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
01482
/* pass 1b: quadwords at offsets 8,24,...,120 (second 8 bytes of every
   16-byte line) */
01483 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
01484 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
01485
01486 HADAMARD48
01487
01488 "movq %%mm7, 120(%1) \n\t"
01489
01490 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
01491 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
01492
01493 "movq 120(%1), %%mm7 \n\t"
01494 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
/* keep the last transposed tile in mm4..mm7 (reordered) instead of storing
   it */
01495 "movq %%mm7, %%mm5 \n\t"
01496 "movq %%mm6, %%mm7 \n\t"
01497 "movq %%mm0, %%mm6 \n\t"
01498
01499
/* pass 2a: the tile stored at offset 64 plus the one still in registers */
01500 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
01501
01502
01503 HADAMARD48
/* mm7 is needed as the zero/mask register of MMABS, so park its data */
01504 "movq %%mm7, 64(%1) \n\t"
01505 MMABS(%%mm0, %%mm7)
01506 MMABS_SUM(%%mm1, %%mm7, %%mm0)
01507 MMABS_SUM(%%mm2, %%mm7, %%mm0)
01508 MMABS_SUM(%%mm3, %%mm7, %%mm0)
01509 MMABS_SUM(%%mm4, %%mm7, %%mm0)
01510 MMABS_SUM(%%mm5, %%mm7, %%mm0)
01511 MMABS_SUM(%%mm6, %%mm7, %%mm0)
01512 "movq 64(%1), %%mm1 \n\t"
01513 MMABS_SUM(%%mm1, %%mm7, %%mm0)
/* save the partial sum of this half */
01514 "movq %%mm0, 64(%1) \n\t"
01515
/* pass 2b: the tiles stored at offsets 0 and 8 */
01516 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
01517 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
01518
01519 HADAMARD48
01520 "movq %%mm7, (%1) \n\t"
01521 MMABS(%%mm0, %%mm7)
01522 MMABS_SUM(%%mm1, %%mm7, %%mm0)
01523 MMABS_SUM(%%mm2, %%mm7, %%mm0)
01524 MMABS_SUM(%%mm3, %%mm7, %%mm0)
01525 MMABS_SUM(%%mm4, %%mm7, %%mm0)
01526 MMABS_SUM(%%mm5, %%mm7, %%mm0)
01527 MMABS_SUM(%%mm6, %%mm7, %%mm0)
01528 "movq (%1), %%mm1 \n\t"
01529 MMABS_SUM(%%mm1, %%mm7, %%mm0)
/* add the partial sum saved above; it also goes through the abs step of
   MMABS_SUM */
01530 "movq 64(%1), %%mm1 \n\t"
01531 MMABS_SUM(%%mm1, %%mm7, %%mm0)
01532
/* horizontal saturating add of the four word lanes into the low word */
01533 "movq %%mm0, %%mm1 \n\t"
01534 "psrlq $32, %%mm0 \n\t"
01535 "paddusw %%mm1, %%mm0 \n\t"
01536 "movq %%mm0, %%mm1 \n\t"
01537 "psrlq $16, %%mm0 \n\t"
01538 "paddusw %%mm1, %%mm0 \n\t"
01539 "movd %%mm0, %0 \n\t"
01540
01541 : "=r" (sum)
01542 : "r"(temp)
01543 );
01544 return sum&0xFFFF;
01545 }
01546
/**
 * Sum of absolute Hadamard-transformed values of an 8x8 block (MMX2).
 *
 * Same instruction sequence as the plain MMX variant, except that the
 * absolute values are formed with the pmaxsw-based MMABS_MMX2 /
 * MMABS_SUM_MMX2 macros.  temp (128 bytes) is filled by diff_pixels_mmx()
 * -- presumably with the 8x8 16-bit differences src1 - src2; verify against
 * that helper.
 *
 * @param s       unused
 * @param src1    first block, passed to diff_pixels_mmx()
 * @param src2    second block, passed to diff_pixels_mmx()
 * @param stride  passed to diff_pixels_mmx()
 * @param h       asserted to be 8
 * @return low 16 bits of the (saturating) sum
 *
 * NOTE(review): temp is written by the asm through an input-only "r"
 * operand and no "memory" clobber is declared.
 */
01547 static int hadamard8_diff_mmx2(void *s, uint8_t *src1, uint8_t *src2, int stride, int h){
01548 uint64_t temp[16] __align8;
01549 int sum=0;
01550
01551 assert(h==8);
01552
01553 diff_pixels_mmx((DCTELEM*)temp, src1, src2, stride);
01554
01555 asm volatile(
/* pass 1a: quadwords at offsets 0,16,...,112 (first 8 bytes of every
   16-byte line of temp) */
01556 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
01557 LOAD4(64, %%mm4, %%mm5, %%mm6, %%mm7)
01558
01559 HADAMARD48
01560
/* spill mm7: TRANSPOSE4 needs a scratch register; offset 112 was already
   loaded above */
01561 "movq %%mm7, 112(%1) \n\t"
01562
01563 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
01564 STORE4(0 , %%mm0, %%mm3, %%mm7, %%mm2)
01565
01566 "movq 112(%1), %%mm7 \n\t"
01567 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
01568 STORE4(64, %%mm4, %%mm7, %%mm0, %%mm6)
01569
/* pass 1b: quadwords at offsets 8,24,...,120 (second 8 bytes of every
   16-byte line) */
01570 LOAD4(8 , %%mm0, %%mm1, %%mm2, %%mm3)
01571 LOAD4(72, %%mm4, %%mm5, %%mm6, %%mm7)
01572
01573 HADAMARD48
01574
01575 "movq %%mm7, 120(%1) \n\t"
01576
01577 TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm7)
01578 STORE4(8 , %%mm0, %%mm3, %%mm7, %%mm2)
01579
01580 "movq 120(%1), %%mm7 \n\t"
01581 TRANSPOSE4(%%mm4, %%mm5, %%mm6, %%mm7, %%mm0)
/* keep the last transposed tile in mm4..mm7 (reordered) instead of storing
   it */
01582 "movq %%mm7, %%mm5 \n\t"
01583 "movq %%mm6, %%mm7 \n\t"
01584 "movq %%mm0, %%mm6 \n\t"
01585
01586
/* pass 2a: the tile stored at offset 64 plus the one still in registers */
01587 LOAD4(64, %%mm0, %%mm1, %%mm2, %%mm3)
01588
01589
01590 HADAMARD48
/* mm7 is needed as the scratch register of the abs macros, so park its
   data */
01591 "movq %%mm7, 64(%1) \n\t"
01592 MMABS_MMX2(%%mm0, %%mm7)
01593 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
01594 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
01595 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
01596 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
01597 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
01598 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
01599 "movq 64(%1), %%mm1 \n\t"
01600 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
/* save the partial sum of this half */
01601 "movq %%mm0, 64(%1) \n\t"
01602
/* pass 2b: the tiles stored at offsets 0 and 8 */
01603 LOAD4(0 , %%mm0, %%mm1, %%mm2, %%mm3)
01604 LOAD4(8 , %%mm4, %%mm5, %%mm6, %%mm7)
01605
01606 HADAMARD48
01607 "movq %%mm7, (%1) \n\t"
01608 MMABS_MMX2(%%mm0, %%mm7)
01609 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
01610 MMABS_SUM_MMX2(%%mm2, %%mm7, %%mm0)
01611 MMABS_SUM_MMX2(%%mm3, %%mm7, %%mm0)
01612 MMABS_SUM_MMX2(%%mm4, %%mm7, %%mm0)
01613 MMABS_SUM_MMX2(%%mm5, %%mm7, %%mm0)
01614 MMABS_SUM_MMX2(%%mm6, %%mm7, %%mm0)
01615 "movq (%1), %%mm1 \n\t"
01616 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
/* add the partial sum saved above; it also goes through the abs step of
   MMABS_SUM_MMX2 */
01617 "movq 64(%1), %%mm1 \n\t"
01618 MMABS_SUM_MMX2(%%mm1, %%mm7, %%mm0)
01619
/* horizontal saturating add of the four word lanes into the low word */
01620 "movq %%mm0, %%mm1 \n\t"
01621 "psrlq $32, %%mm0 \n\t"
01622 "paddusw %%mm1, %%mm0 \n\t"
01623 "movq %%mm0, %%mm1 \n\t"
01624 "psrlq $16, %%mm0 \n\t"
01625 "paddusw %%mm1, %%mm0 \n\t"
01626 "movd %%mm0, %0 \n\t"
01627
01628 : "=r" (sum)
01629 : "r"(temp)
01630 );
01631 return sum&0xFFFF;
01632 }
01633
01634
/* Instantiate hadamard8_diff16_mmx / hadamard8_diff16_mmx2 from the 8x8
   functions above.  WARPER8_16_SQ is defined outside this file section;
   presumably it builds a 16x16 wrapper out of 8x8 calls -- TODO confirm. */
01635 WARPER8_16_SQ(hadamard8_diff_mmx, hadamard8_diff16_mmx)
01636 WARPER8_16_SQ(hadamard8_diff_mmx2, hadamard8_diff16_mmx2)
01637 #endif //CONFIG_ENCODERS
01638
/* The "no_rnd" full-pel put variants are plain aliases of the regular put
   functions. */
01639 #define put_no_rnd_pixels8_mmx(a,b,c,d) put_pixels8_mmx(a,b,c,d)
01640 #define put_no_rnd_pixels16_mmx(a,b,c,d) put_pixels16_mmx(a,b,c,d)
01641
/* One output row of the vertical qpel lowpass filter, on four words.
 *
 * Computes
 *   (20*(m3 + m4) - (in0 + in7) + 3*((in1 + m6) - 2*(in2 + m5)) + rnd) >> 5
 * packs the result to bytes with unsigned saturation and hands it to
 * OP(%%mm5, out, %%mm7, d).
 *
 * m3..m6 are MMX registers; in0, in1, in2, in7 and rnd are memory operands;
 * out is the destination operand.  On exit m3 holds the value loaded from
 * in7, and mm4, mm5 and mm6 are clobbered.  The pw_20 and pw_3 parameters
 * are not referenced; the constants are addressed through MANGLE() instead.
 */
01642 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
01643 "paddw " #m4 ", " #m3 " \n\t" \
01644 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" \
01645 "pmullw " #m3 ", %%mm4 \n\t" \
01646 "movq "#in7", " #m3 " \n\t" \
01647 "movq "#in0", %%mm5 \n\t" \
01648 "paddw " #m3 ", %%mm5 \n\t" \
01649 "psubw %%mm5, %%mm4 \n\t" \
01650 "movq "#in1", %%mm5 \n\t" \
01651 "movq "#in2", %%mm6 \n\t" \
01652 "paddw " #m6 ", %%mm5 \n\t" \
01653 "paddw " #m5 ", %%mm6 \n\t" \
01654 "paddw %%mm6, %%mm6 \n\t" \
01655 "psubw %%mm6, %%mm5 \n\t" \
01656 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" \
01657 "paddw " #rnd ", %%mm4 \n\t" \
01658 "paddw %%mm4, %%mm5 \n\t" \
01659 "psraw $5, %%mm5 \n\t"\
01660 "packuswb %%mm5, %%mm5 \n\t"\
01661 OP(%%mm5, out, %%mm7, d)
01662
/* QPEL_BASE generates the four horizontal MPEG-4 qpel lowpass functions
 *   OPNAME mpeg4_qpel16_h_lowpass_mmx2 / _3dnow
 *   OPNAME mpeg4_qpel8_h_lowpass_mmx2  / _3dnow
 *
 * OPNAME    prefix pasted onto the generated function names
 * ROUNDER   memory operand added before the >>5 shift
 * RND       not referenced inside this macro
 * OP_MMX2   store macro used by the _mmx2 functions
 * OP_3DNOW  store macro used by the _3dnow functions
 *
 * Every generated function takes (dst, src, dstStride, srcStride, h) and
 * filters h rows.  Each output is
 *   (20*(s[k]+s[k+1]) - 6*(s[k-1]+s[k+2]) + 3*(s[k-2]+s[k+3])
 *    - (s[k-3]+s[k+4]) + ROUNDER) >> 5
 * packed with unsigned saturation; as the C tap tables of the _3dnow
 * variants show, taps falling outside src[0..16] (src[0..8] for the 8-wide
 * version) are mirrored back into the row.
 *
 * In the _3dnow variants the taps are evaluated in C; only the rounding,
 * shift, pack and store are done in asm.
 */
01663 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
/* 16-wide, MMX2: four groups of four outputs per row */\
01664 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01665 uint64_t temp;\
01666 \
01667 asm volatile(\
01668 "pxor %%mm7, %%mm7 \n\t"\
01669 "1: \n\t"\
01670 "movq (%0), %%mm0 \n\t" \
01671 "movq %%mm0, %%mm1 \n\t" \
01672 "movq %%mm0, %%mm2 \n\t" \
01673 "punpcklbw %%mm7, %%mm0 \n\t" \
01674 "punpckhbw %%mm7, %%mm1 \n\t" \
01675 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01676 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01677 "movq %%mm2, %%mm3 \n\t" \
01678 "movq %%mm2, %%mm4 \n\t" \
01679 "psllq $8, %%mm2 \n\t" \
01680 "psllq $16, %%mm3 \n\t" \
01681 "psllq $24, %%mm4 \n\t" \
01682 "punpckhbw %%mm7, %%mm2 \n\t" \
01683 "punpckhbw %%mm7, %%mm3 \n\t" \
01684 "punpckhbw %%mm7, %%mm4 \n\t" \
01685 "paddw %%mm3, %%mm5 \n\t" \
01686 "paddw %%mm2, %%mm6 \n\t" \
01687 "paddw %%mm5, %%mm5 \n\t" \
01688 "psubw %%mm5, %%mm6 \n\t" \
01689 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01690 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01691 "paddw %%mm4, %%mm0 \n\t" \
01692 "paddw %%mm1, %%mm5 \n\t" \
01693 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01694 "psubw %%mm5, %%mm0 \n\t" \
01695 "paddw %6, %%mm6 \n\t"\
01696 "paddw %%mm6, %%mm0 \n\t" \
01697 "psraw $5, %%mm0 \n\t"\
/* park the first four results in temp until the next four are ready */\
01698 "movq %%mm0, %5 \n\t"\
01699 \
01700 \
01701 "movq 5(%0), %%mm0 \n\t" \
01702 "movq %%mm0, %%mm5 \n\t" \
01703 "movq %%mm0, %%mm6 \n\t" \
01704 "psrlq $8, %%mm0 \n\t" \
01705 "psrlq $16, %%mm5 \n\t" \
01706 "punpcklbw %%mm7, %%mm0 \n\t" \
01707 "punpcklbw %%mm7, %%mm5 \n\t" \
01708 "paddw %%mm0, %%mm2 \n\t" \
01709 "paddw %%mm5, %%mm3 \n\t" \
01710 "paddw %%mm2, %%mm2 \n\t" \
01711 "psubw %%mm2, %%mm3 \n\t" \
01712 "movq %%mm6, %%mm2 \n\t" \
01713 "psrlq $24, %%mm6 \n\t" \
01714 "punpcklbw %%mm7, %%mm2 \n\t" \
01715 "punpcklbw %%mm7, %%mm6 \n\t" \
01716 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01717 "paddw %%mm2, %%mm1 \n\t" \
01718 "paddw %%mm6, %%mm4 \n\t" \
01719 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01720 "psubw %%mm4, %%mm3 \n\t" \
01721 "paddw %6, %%mm1 \n\t"\
01722 "paddw %%mm1, %%mm3 \n\t" \
01723 "psraw $5, %%mm3 \n\t"\
/* pack results 0-3 (from temp) with 4-7 and store the first 8 bytes */\
01724 "movq %5, %%mm1 \n\t"\
01725 "packuswb %%mm3, %%mm1 \n\t"\
01726 OP_MMX2(%%mm1, (%1),%%mm4, q)\
01727 \
01728 \
01729 "movq 9(%0), %%mm1 \n\t" \
01730 "movq %%mm1, %%mm4 \n\t" \
01731 "movq %%mm1, %%mm3 \n\t" \
01732 "psrlq $8, %%mm1 \n\t" \
01733 "psrlq $16, %%mm4 \n\t" \
01734 "punpcklbw %%mm7, %%mm1 \n\t" \
01735 "punpcklbw %%mm7, %%mm4 \n\t" \
01736 "paddw %%mm1, %%mm5 \n\t" \
01737 "paddw %%mm4, %%mm0 \n\t" \
01738 "paddw %%mm5, %%mm5 \n\t" \
01739 "psubw %%mm5, %%mm0 \n\t" \
01740 "movq %%mm3, %%mm5 \n\t" \
01741 "psrlq $24, %%mm3 \n\t" \
01742 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" \
01743 "punpcklbw %%mm7, %%mm3 \n\t" \
01744 "paddw %%mm3, %%mm2 \n\t" \
01745 "psubw %%mm2, %%mm0 \n\t" \
01746 "movq %%mm5, %%mm2 \n\t" \
01747 "punpcklbw %%mm7, %%mm2 \n\t" \
01748 "punpckhbw %%mm7, %%mm5 \n\t" \
01749 "paddw %%mm2, %%mm6 \n\t" \
01750 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" \
01751 "paddw %6, %%mm0 \n\t"\
01752 "paddw %%mm6, %%mm0 \n\t" \
01753 "psraw $5, %%mm0 \n\t"\
01754 \
01755 \
01756 "paddw %%mm5, %%mm3 \n\t" \
01757 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01758 "paddw %%mm4, %%mm6 \n\t" \
01759 "pshufw $0xBE, %%mm5, %%mm4 \n\t" \
01760 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01761 "paddw %%mm1, %%mm4 \n\t" \
01762 "paddw %%mm2, %%mm5 \n\t" \
01763 "paddw %%mm6, %%mm6 \n\t" \
01764 "psubw %%mm6, %%mm4 \n\t" \
01765 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" \
01766 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" \
01767 "psubw %%mm5, %%mm3 \n\t" \
01768 "paddw %6, %%mm4 \n\t"\
01769 "paddw %%mm3, %%mm4 \n\t" \
01770 "psraw $5, %%mm4 \n\t"\
/* pack results 8-11 with 12-15 and store the second 8 bytes */\
01771 "packuswb %%mm4, %%mm0 \n\t"\
01772 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
01773 \
/* next row: advance src/dst by their strides, count h down in memory */\
01774 "add %3, %0 \n\t"\
01775 "add %4, %1 \n\t"\
01776 "decl %2 \n\t"\
01777 " jnz 1b \n\t"\
01778 : "+a"(src), "+c"(dst), "+m"(h)\
01779 : "d"((long)srcStride), "S"((long)dstStride), "m"(temp), "m"(ROUNDER)\
01780 : "memory"\
01781 );\
01782 }\
01783 \
/* 16-wide, "3dnow": taps evaluated in C into temp[16], asm only rounds, */\
/* shifts, packs and stores */\
01784 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01785 int i;\
01786 int16_t temp[16];\
01787 \
01788 for(i=0; i<h; i++)\
01789 {\
01790 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01791 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01792 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01793 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01794 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01795 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
01796 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
01797 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
01798 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
01799 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
01800 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
01801 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
01802 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
01803 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
01804 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
01805 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
01806 asm volatile(\
01807 "movq (%0), %%mm0 \n\t"\
01808 "movq 8(%0), %%mm1 \n\t"\
01809 "paddw %2, %%mm0 \n\t"\
01810 "paddw %2, %%mm1 \n\t"\
01811 "psraw $5, %%mm0 \n\t"\
01812 "psraw $5, %%mm1 \n\t"\
01813 "packuswb %%mm1, %%mm0 \n\t"\
01814 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01815 "movq 16(%0), %%mm0 \n\t"\
01816 "movq 24(%0), %%mm1 \n\t"\
01817 "paddw %2, %%mm0 \n\t"\
01818 "paddw %2, %%mm1 \n\t"\
01819 "psraw $5, %%mm0 \n\t"\
01820 "psraw $5, %%mm1 \n\t"\
01821 "packuswb %%mm1, %%mm0 \n\t"\
01822 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
01823 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01824 : "memory"\
01825 );\
01826 dst+=dstStride;\
01827 src+=srcStride;\
01828 }\
01829 }\
01830 \
/* 8-wide, MMX2: two groups of four outputs per row; temp is only listed */\
/* as operand %5 and is not referenced by the asm text */\
01831 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01832 uint64_t temp;\
01833 \
01834 asm volatile(\
01835 "pxor %%mm7, %%mm7 \n\t"\
01836 "1: \n\t"\
01837 "movq (%0), %%mm0 \n\t" \
01838 "movq %%mm0, %%mm1 \n\t" \
01839 "movq %%mm0, %%mm2 \n\t" \
01840 "punpcklbw %%mm7, %%mm0 \n\t" \
01841 "punpckhbw %%mm7, %%mm1 \n\t" \
01842 "pshufw $0x90, %%mm0, %%mm5 \n\t" \
01843 "pshufw $0x41, %%mm0, %%mm6 \n\t" \
01844 "movq %%mm2, %%mm3 \n\t" \
01845 "movq %%mm2, %%mm4 \n\t" \
01846 "psllq $8, %%mm2 \n\t" \
01847 "psllq $16, %%mm3 \n\t" \
01848 "psllq $24, %%mm4 \n\t" \
01849 "punpckhbw %%mm7, %%mm2 \n\t" \
01850 "punpckhbw %%mm7, %%mm3 \n\t" \
01851 "punpckhbw %%mm7, %%mm4 \n\t" \
01852 "paddw %%mm3, %%mm5 \n\t" \
01853 "paddw %%mm2, %%mm6 \n\t" \
01854 "paddw %%mm5, %%mm5 \n\t" \
01855 "psubw %%mm5, %%mm6 \n\t" \
01856 "pshufw $0x06, %%mm0, %%mm5 \n\t" \
01857 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" \
01858 "paddw %%mm4, %%mm0 \n\t" \
01859 "paddw %%mm1, %%mm5 \n\t" \
01860 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" \
01861 "psubw %%mm5, %%mm0 \n\t" \
01862 "paddw %6, %%mm6 \n\t"\
01863 "paddw %%mm6, %%mm0 \n\t" \
01864 "psraw $5, %%mm0 \n\t"\
01865 \
01866 \
01867 "movd 5(%0), %%mm5 \n\t" \
01868 "punpcklbw %%mm7, %%mm5 \n\t" \
01869 "pshufw $0xF9, %%mm5, %%mm6 \n\t" \
01870 "paddw %%mm5, %%mm1 \n\t" \
01871 "paddw %%mm6, %%mm2 \n\t" \
01872 "pshufw $0xBE, %%mm5, %%mm6 \n\t" \
01873 "pshufw $0x6F, %%mm5, %%mm5 \n\t" \
01874 "paddw %%mm6, %%mm3 \n\t" \
01875 "paddw %%mm5, %%mm4 \n\t" \
01876 "paddw %%mm2, %%mm2 \n\t" \
01877 "psubw %%mm2, %%mm3 \n\t" \
01878 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" \
01879 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" \
01880 "psubw %%mm4, %%mm3 \n\t" \
01881 "paddw %6, %%mm1 \n\t"\
01882 "paddw %%mm1, %%mm3 \n\t" \
01883 "psraw $5, %%mm3 \n\t"\
/* pack results 0-3 with 4-7 and store the 8 output bytes */\
01884 "packuswb %%mm3, %%mm0 \n\t"\
01885 OP_MMX2(%%mm0, (%1), %%mm4, q)\
01886 \
/* next row: advance src/dst by their strides, count h down in memory */\
01887 "add %3, %0 \n\t"\
01888 "add %4, %1 \n\t"\
01889 "decl %2 \n\t"\
01890 " jnz 1b \n\t"\
01891 : "+a"(src), "+c"(dst), "+m"(h)\
01892 : "S"((long)srcStride), "D"((long)dstStride), "m"(temp), "m"(ROUNDER)\
01893 : "memory"\
01894 );\
01895 }\
01896 \
/* 8-wide, "3dnow": taps evaluated in C into temp[8], asm only rounds, */\
/* shifts, packs and stores */\
01897 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
01898 int i;\
01899 int16_t temp[8];\
01900 \
01901 for(i=0; i<h; i++)\
01902 {\
01903 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
01904 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
01905 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
01906 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
01907 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
01908 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
01909 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
01910 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
01911 asm volatile(\
01912 "movq (%0), %%mm0 \n\t"\
01913 "movq 8(%0), %%mm1 \n\t"\
01914 "paddw %2, %%mm0 \n\t"\
01915 "paddw %2, %%mm1 \n\t"\
01916 "psraw $5, %%mm0 \n\t"\
01917 "psraw $5, %%mm1 \n\t"\
01918 "packuswb %%mm1, %%mm0 \n\t"\
01919 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
01920 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
01921 :"memory"\
01922 );\
01923 dst+=dstStride;\
01924 src+=srcStride;\
01925 }\
01926 }
01927
01928 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
01929 \
01930 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
01931 uint64_t temp[17*4];\
01932 uint64_t *temp_ptr= temp;\
01933 int count= 17;\
01934 \
01935 \
01936 asm volatile(\
01937 "pxor %%mm7, %%mm7 \n\t"\
01938 "1: \n\t"\
01939 "movq (%0), %%mm0 \n\t"\
01940 "movq (%0), %%mm1 \n\t"\
01941 "movq 8(%0), %%mm2 \n\t"\
01942 "movq 8(%0), %%mm3 \n\t"\
01943 "punpcklbw %%mm7, %%mm0 \n\t"\
01944 "punpckhbw %%mm7, %%mm1 \n\t"\
01945 "punpcklbw %%mm7, %%mm2 \n\t"\
01946 "punpckhbw %%mm7, %%mm3 \n\t"\
01947 "movq %%mm0, (%1) \n\t"\
01948 "movq %%mm1, 17*8(%1) \n\t"\
01949 "movq %%mm2, 2*17*8(%1) \n\t"\
01950 "movq %%mm3, 3*17*8(%1) \n\t"\
01951 "add $8, %1 \n\t"\
01952 "add %3, %0 \n\t"\
01953 "decl %2 \n\t"\
01954 " jnz 1b \n\t"\
01955 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
01956 : "r" ((long)srcStride)\
01957 : "memory"\
01958 );\
01959 \
01960 temp_ptr= temp;\
01961 count=4;\
01962 \
01963 \
01964 asm volatile(\
01965 \
01966 "1: \n\t"\
01967 "movq (%0), %%mm0 \n\t"\
01968 "movq 8(%0), %%mm1 \n\t"\
01969 "movq 16(%0), %%mm2 \n\t"\
01970 "movq 24(%0), %%mm3 \n\t"\
01971 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
01972 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
01973 "add %4, %1 \n\t"\
01974 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
01975 \
01976 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
01977 "add %4, %1 \n\t"\
01978 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
01979 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
01980 "add %4, %1 \n\t"\
01981 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
01982 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
01983 "add %4, %1 \n\t"\
01984 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
01985 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
01986 "add %4, %1 \n\t"\
01987 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
01988 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
01989 "add %4, %1 \n\t"\
01990 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
01991 \
01992 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
01993 "add %4, %1 \n\t" \
01994 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
01995 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
01996 \
01997 "add $136, %0 \n\t"\
01998 "add %6, %1 \n\t"\
01999 "decl %2 \n\t"\
02000 " jnz 1b \n\t"\
02001 \
02002 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02003 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-14*(long)dstStride)\
02004 :"memory"\
02005 );\
02006 }\
02007 \
02008 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
02009 uint64_t temp[9*2];\
02010 uint64_t *temp_ptr= temp;\
02011 int count= 9;\
02012 \
02013 \
02014 asm volatile(\
02015 "pxor %%mm7, %%mm7 \n\t"\
02016 "1: \n\t"\
02017 "movq (%0), %%mm0 \n\t"\
02018 "movq (%0), %%mm1 \n\t"\
02019 "punpcklbw %%mm7, %%mm0 \n\t"\
02020 "punpckhbw %%mm7, %%mm1 \n\t"\
02021 "movq %%mm0, (%1) \n\t"\
02022 "movq %%mm1, 9*8(%1) \n\t"\
02023 "add $8, %1 \n\t"\
02024 "add %3, %0 \n\t"\
02025 "decl %2 \n\t"\
02026 " jnz 1b \n\t"\
02027 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
02028 : "r" ((long)srcStride)\
02029 : "memory"\
02030 );\
02031 \
02032 temp_ptr= temp;\
02033 count=2;\
02034 \
02035 \
02036 asm volatile(\
02037 \
02038 "1: \n\t"\
02039 "movq (%0), %%mm0 \n\t"\
02040 "movq 8(%0), %%mm1 \n\t"\
02041 "movq 16(%0), %%mm2 \n\t"\
02042 "movq 24(%0), %%mm3 \n\t"\
02043 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
02044 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
02045 "add %4, %1 \n\t"\
02046 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
02047 \
02048 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
02049 "add %4, %1 \n\t"\
02050 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
02051 \
02052 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
02053 "add %4, %1 \n\t"\
02054 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
02055 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
02056 \
02057 "add $72, %0 \n\t"\
02058 "add %6, %1 \n\t"\
02059 "decl %2 \n\t"\
02060 " jnz 1b \n\t"\
02061 \
02062 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
02063 : "r"((long)dstStride), "r"(2*(long)dstStride), "m"(ROUNDER), "g"(4-6*(long)dstStride)\
02064 : "memory"\
02065 );\
02066 }\
02067 \
02068 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
02069 OPNAME ## pixels8_mmx(dst, src, stride, 8);\
02070 }\
02071 \
02072 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02073 uint64_t temp[8];\
02074 uint8_t * const half= (uint8_t*)temp;\
02075 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02076 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02077 }\
02078 \
02079 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02080 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
02081 }\
02082 \
02083 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02084 uint64_t temp[8];\
02085 uint8_t * const half= (uint8_t*)temp;\
02086 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
02087 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
02088 }\
02089 \
02090 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02091 uint64_t temp[8];\
02092 uint8_t * const half= (uint8_t*)temp;\
02093 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02094 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
02095 }\
02096 \
02097 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02098 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
02099 }\
02100 \
02101 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02102 uint64_t temp[8];\
02103 uint8_t * const half= (uint8_t*)temp;\
02104 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
02105 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
02106 }\
02107 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02108 uint64_t half[8 + 9];\
02109 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02110 uint8_t * const halfHV= ((uint8_t*)half);\
02111 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02112 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02113 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02114 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02115 }\
02116 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02117 uint64_t half[8 + 9];\
02118 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02119 uint8_t * const halfHV= ((uint8_t*)half);\
02120 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02121 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02122 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02123 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02124 }\
02125 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02126 uint64_t half[8 + 9];\
02127 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02128 uint8_t * const halfHV= ((uint8_t*)half);\
02129 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02130 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02131 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02132 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02133 }\
02134 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02135 uint64_t half[8 + 9];\
02136 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02137 uint8_t * const halfHV= ((uint8_t*)half);\
02138 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02139 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02140 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02141 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02142 }\
02143 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02144 uint64_t half[8 + 9];\
02145 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02146 uint8_t * const halfHV= ((uint8_t*)half);\
02147 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02148 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02149 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
02150 }\
02151 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02152 uint64_t half[8 + 9];\
02153 uint8_t * const halfH= ((uint8_t*)half) + 64;\
02154 uint8_t * const halfHV= ((uint8_t*)half);\
02155 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02156 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
02157 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
02158 }\
02159 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02160 uint64_t half[8 + 9];\
02161 uint8_t * const halfH= ((uint8_t*)half);\
02162 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02163 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
02164 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02165 }\
02166 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02167 uint64_t half[8 + 9];\
02168 uint8_t * const halfH= ((uint8_t*)half);\
02169 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02170 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
02171 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02172 }\
02173 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02174 uint64_t half[9];\
02175 uint8_t * const halfH= ((uint8_t*)half);\
02176 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
02177 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
02178 }\
02179 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
02180 OPNAME ## pixels16_mmx(dst, src, stride, 16);\
02181 }\
02182 \
02183 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02184 uint64_t temp[32];\
02185 uint8_t * const half= (uint8_t*)temp;\
02186 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02187 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02188 }\
02189 \
02190 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02191 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
02192 }\
02193 \
02194 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02195 uint64_t temp[32];\
02196 uint8_t * const half= (uint8_t*)temp;\
02197 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
02198 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
02199 }\
02200 \
02201 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02202 uint64_t temp[32];\
02203 uint8_t * const half= (uint8_t*)temp;\
02204 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02205 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
02206 }\
02207 \
02208 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02209 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
02210 }\
02211 \
02212 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02213 uint64_t temp[32];\
02214 uint8_t * const half= (uint8_t*)temp;\
02215 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
02216 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
02217 }\
02218 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02219 uint64_t half[16*2 + 17*2];\
02220 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02221 uint8_t * const halfHV= ((uint8_t*)half);\
02222 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02223 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02224 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02225 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02226 }\
02227 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02228 uint64_t half[16*2 + 17*2];\
02229 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02230 uint8_t * const halfHV= ((uint8_t*)half);\
02231 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02232 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02233 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02234 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02235 }\
02236 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02237 uint64_t half[16*2 + 17*2];\
02238 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02239 uint8_t * const halfHV= ((uint8_t*)half);\
02240 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02241 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02242 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02243 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02244 }\
02245 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02246 uint64_t half[16*2 + 17*2];\
02247 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02248 uint8_t * const halfHV= ((uint8_t*)half);\
02249 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02250 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02251 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02252 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02253 }\
02254 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02255 uint64_t half[16*2 + 17*2];\
02256 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02257 uint8_t * const halfHV= ((uint8_t*)half);\
02258 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02259 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02260 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
02261 }\
02262 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02263 uint64_t half[16*2 + 17*2];\
02264 uint8_t * const halfH= ((uint8_t*)half) + 256;\
02265 uint8_t * const halfHV= ((uint8_t*)half);\
02266 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02267 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
02268 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
02269 }\
02270 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02271 uint64_t half[17*2];\
02272 uint8_t * const halfH= ((uint8_t*)half);\
02273 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02274 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
02275 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02276 }\
02277 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02278 uint64_t half[17*2];\
02279 uint8_t * const halfH= ((uint8_t*)half);\
02280 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02281 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
02282 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02283 }\
02284 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
02285 uint64_t half[17*2];\
02286 uint8_t * const halfH= ((uint8_t*)half);\
02287 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
02288 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
02289 }
02290
/* Store step of the "put" variants: expands to one mov<size> from a to b.
 * temp is unused here; it is accepted so that PUT_OP can be passed wherever
 * an AVG_*_OP (which needs a scratch register) is expected. */
#define PUT_OP(a,b,temp, size) \
    "mov" #size " " #a ", " #b " \n\t"
/* Store step of the "avg" variants using the 3DNow! pavgusb instruction:
 * load the current destination b into temp, average it into a, then write a
 * back to b.  Note that a is modified by the expansion. */
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
/* Store step of the "avg" variants using the MMX2 pavgb instruction:
 * load the current destination b into temp, average it into a, then write a
 * back to b.  Note that a is modified by the expansion. */
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a " \n\t"\
    "mov" #size " " #a ", " #b " \n\t"
02300
02301 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
02302 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
02303 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
02304 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
02305 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
02306 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
02307 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
02308 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
02309 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
02310
#if 0
/* Disabled no-op stub.  Its only users are the (equally disabled) assignments
 * at the end of dsputil_init_mmx().  The parameter list is intentionally left
 * unspecified so the stub could be assigned to any of those pointers. */
static void just_return()
{
}
#endif
02314
/* Fills one slot in each of the put_, put_no_rnd_ and avg_ tables reachable
 * through a pointer named `c` in the expanding scope.
 *   postfix1: text pasted after put_/put_no_rnd_/avg_ on the member side
 *   postfix2: text pasted after put_/put_no_rnd_/avg_ on the function side
 * The expansion already ends in ';', so uses are written without one. */
#define SET_QPEL_FUNC(postfix1, postfix2) \
    c->put_        ## postfix1 = put_        ## postfix2;\
    c->put_no_rnd_ ## postfix1 = put_no_rnd_ ## postfix2;\
    c->avg_        ## postfix1 = avg_        ## postfix2;
02319
02320 static int try_8x8basis_mmx(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
02321 long i=0;
02322
02323 assert(ABS(scale) < 256);
02324 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
02325
02326 asm volatile(
02327 "pcmpeqw %%mm6, %%mm6 \n\t"
02328 "psrlw $15, %%mm6 \n\t"
02329 "pxor %%mm7, %%mm7 \n\t"
02330 "movd %4, %%mm5 \n\t"
02331 "punpcklwd %%mm5, %%mm5 \n\t"
02332 "punpcklwd %%mm5, %%mm5 \n\t"
02333 "1: \n\t"
02334 "movq (%1, %0), %%mm0 \n\t"
02335 "movq 8(%1, %0), %%mm1 \n\t"
02336 "pmulhw %%mm5, %%mm0 \n\t"
02337 "pmulhw %%mm5, %%mm1 \n\t"
02338 "paddw %%mm6, %%mm0 \n\t"
02339 "paddw %%mm6, %%mm1 \n\t"
02340 "psraw $1, %%mm0 \n\t"
02341 "psraw $1, %%mm1 \n\t"
02342 "paddw (%2, %0), %%mm0 \n\t"
02343 "paddw 8(%2, %0), %%mm1 \n\t"
02344 "psraw $6, %%mm0 \n\t"
02345 "psraw $6, %%mm1 \n\t"
02346 "pmullw (%3, %0), %%mm0 \n\t"
02347 "pmullw 8(%3, %0), %%mm1 \n\t"
02348 "pmaddwd %%mm0, %%mm0 \n\t"
02349 "pmaddwd %%mm1, %%mm1 \n\t"
02350 "paddd %%mm1, %%mm0 \n\t"
02351 "psrld $4, %%mm0 \n\t"
02352 "paddd %%mm0, %%mm7 \n\t"
02353 "add $16, %0 \n\t"
02354 "cmp $128, %0 \n\t"
02355 " jb 1b \n\t"
02356 "movq %%mm7, %%mm6 \n\t"
02357 "psrlq $32, %%mm7 \n\t"
02358 "paddd %%mm6, %%mm7 \n\t"
02359 "psrld $2, %%mm7 \n\t"
02360 "movd %%mm7, %0 \n\t"
02361
02362 : "+r" (i)
02363 : "r"(basis), "r"(rem), "r"(weight), "g"(scale)
02364 );
02365 return i;
02366 }
02367
02368 static void add_8x8basis_mmx(int16_t rem[64], int16_t basis[64], int scale){
02369 long i=0;
02370
02371 if(ABS(scale) < 256){
02372 scale<<= 16 + 1 - BASIS_SHIFT + RECON_SHIFT;
02373 asm volatile(
02374 "pcmpeqw %%mm6, %%mm6 \n\t"
02375 "psrlw $15, %%mm6 \n\t"
02376 "movd %3, %%mm5 \n\t"
02377 "punpcklwd %%mm5, %%mm5 \n\t"
02378 "punpcklwd %%mm5, %%mm5 \n\t"
02379 "1: \n\t"
02380 "movq (%1, %0), %%mm0 \n\t"
02381 "movq 8(%1, %0), %%mm1 \n\t"
02382 "pmulhw %%mm5, %%mm0 \n\t"
02383 "pmulhw %%mm5, %%mm1 \n\t"
02384 "paddw %%mm6, %%mm0 \n\t"
02385 "paddw %%mm6, %%mm1 \n\t"
02386 "psraw $1, %%mm0 \n\t"
02387 "psraw $1, %%mm1 \n\t"
02388 "paddw (%2, %0), %%mm0 \n\t"
02389 "paddw 8(%2, %0), %%mm1 \n\t"
02390 "movq %%mm0, (%2, %0) \n\t"
02391 "movq %%mm1, 8(%2, %0) \n\t"
02392 "add $16, %0 \n\t"
02393 "cmp $128, %0 \n\t"
02394 " jb 1b \n\t"
02395
02396 : "+r" (i)
02397 : "r"(basis), "r"(rem), "g"(scale)
02398 );
02399 }else{
02400 for(i=0; i<8*8; i++){
02401 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
02402 }
02403 }
02404 }
02405
02406 #include "h264dsp_mmx.c"
02407
02408
02409 void ff_mmx_idct(DCTELEM *block);
02410 void ff_mmxext_idct(DCTELEM *block);
02411
02412 void ff_vp3_idct_sse2(int16_t *input_data);
02413 void ff_vp3_idct_mmx(int16_t *data);
02414 void ff_vp3_dsp_init_mmx(void);
02415
02416
02417
02418 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02419 {
02420 ff_mmx_idct (block);
02421 put_pixels_clamped_mmx(block, dest, line_size);
02422 }
02423 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02424 {
02425 ff_mmx_idct (block);
02426 add_pixels_clamped_mmx(block, dest, line_size);
02427 }
02428 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
02429 {
02430 ff_mmxext_idct (block);
02431 put_pixels_clamped_mmx(block, dest, line_size);
02432 }
02433 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
02434 {
02435 ff_mmxext_idct (block);
02436 add_pixels_clamped_mmx(block, dest, line_size);
02437 }
02438 static void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block)
02439 {
02440 ff_vp3_idct_sse2(block);
02441 put_signed_pixels_clamped_mmx(block, dest, line_size);
02442 }
02443 static void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block)
02444 {
02445 ff_vp3_idct_sse2(block);
02446 add_pixels_clamped_mmx(block, dest, line_size);
02447 }
02448 static void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block)
02449 {
02450 ff_vp3_idct_mmx(block);
02451 put_signed_pixels_clamped_mmx(block, dest, line_size);
02452 }
02453 static void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block)
02454 {
02455 ff_vp3_idct_mmx(block);
02456 add_pixels_clamped_mmx(block, dest, line_size);
02457 }
02458
02459 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
02460 {
02461 mm_flags = mm_support();
02462
02463 if (avctx->dsp_mask) {
02464 if (avctx->dsp_mask & FF_MM_FORCE)
02465 mm_flags |= (avctx->dsp_mask & 0xffff);
02466 else
02467 mm_flags &= ~(avctx->dsp_mask & 0xffff);
02468 }
02469
02470 #if 0
02471 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
02472 if (mm_flags & MM_MMX)
02473 av_log(avctx, AV_LOG_INFO, " mmx");
02474 if (mm_flags & MM_MMXEXT)
02475 av_log(avctx, AV_LOG_INFO, " mmxext");
02476 if (mm_flags & MM_3DNOW)
02477 av_log(avctx, AV_LOG_INFO, " 3dnow");
02478 if (mm_flags & MM_SSE)
02479 av_log(avctx, AV_LOG_INFO, " sse");
02480 if (mm_flags & MM_SSE2)
02481 av_log(avctx, AV_LOG_INFO, " sse2");
02482 av_log(avctx, AV_LOG_INFO, "\n");
02483 #endif
02484
02485 if (mm_flags & MM_MMX) {
02486 const int idct_algo= avctx->idct_algo;
02487
02488 #ifdef CONFIG_ENCODERS
02489 const int dct_algo = avctx->dct_algo;
02490 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
02491 if(mm_flags & MM_SSE2){
02492 c->fdct = ff_fdct_sse2;
02493 }else if(mm_flags & MM_MMXEXT){
02494 c->fdct = ff_fdct_mmx2;
02495 }else{
02496 c->fdct = ff_fdct_mmx;
02497 }
02498 }
02499 #endif //CONFIG_ENCODERS
02500 if(avctx->lowres==0){
02501 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
02502 c->idct_put= ff_simple_idct_put_mmx;
02503 c->idct_add= ff_simple_idct_add_mmx;
02504 c->idct = ff_simple_idct_mmx;
02505 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
02506 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
02507 if(mm_flags & MM_MMXEXT){
02508 c->idct_put= ff_libmpeg2mmx2_idct_put;
02509 c->idct_add= ff_libmpeg2mmx2_idct_add;
02510 c->idct = ff_mmxext_idct;
02511 }else{
02512 c->idct_put= ff_libmpeg2mmx_idct_put;
02513 c->idct_add= ff_libmpeg2mmx_idct_add;
02514 c->idct = ff_mmx_idct;
02515 }
02516 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
02517 }else if(idct_algo==FF_IDCT_VP3){
02518 if(mm_flags & MM_SSE2){
02519 c->idct_put= ff_vp3_idct_put_sse2;
02520 c->idct_add= ff_vp3_idct_add_sse2;
02521 c->idct = ff_vp3_idct_sse2;
02522 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
02523 }else{
02524 ff_vp3_dsp_init_mmx();
02525 c->idct_put= ff_vp3_idct_put_mmx;
02526 c->idct_add= ff_vp3_idct_add_mmx;
02527 c->idct = ff_vp3_idct_mmx;
02528 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
02529 }
02530 }
02531 }
02532
02533 #ifdef CONFIG_ENCODERS
02534 c->get_pixels = get_pixels_mmx;
02535 c->diff_pixels = diff_pixels_mmx;
02536 #endif //CONFIG_ENCODERS
02537 c->put_pixels_clamped = put_pixels_clamped_mmx;
02538 c->put_signed_pixels_clamped = put_signed_pixels_clamped_mmx;
02539 c->add_pixels_clamped = add_pixels_clamped_mmx;
02540 c->clear_blocks = clear_blocks_mmx;
02541 #ifdef CONFIG_ENCODERS
02542 c->pix_sum = pix_sum16_mmx;
02543 #endif //CONFIG_ENCODERS
02544
02545 c->put_pixels_tab[0][0] = put_pixels16_mmx;
02546 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
02547 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
02548 c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
02549
02550 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_mmx;
02551 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx;
02552 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx;
02553 c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_mmx;
02554
02555 c->avg_pixels_tab[0][0] = avg_pixels16_mmx;
02556 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx;
02557 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx;
02558 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx;
02559
02560 c->avg_no_rnd_pixels_tab[0][0] = avg_no_rnd_pixels16_mmx;
02561 c->avg_no_rnd_pixels_tab[0][1] = avg_no_rnd_pixels16_x2_mmx;
02562 c->avg_no_rnd_pixels_tab[0][2] = avg_no_rnd_pixels16_y2_mmx;
02563 c->avg_no_rnd_pixels_tab[0][3] = avg_no_rnd_pixels16_xy2_mmx;
02564
02565 c->put_pixels_tab[1][0] = put_pixels8_mmx;
02566 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx;
02567 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx;
02568 c->put_pixels_tab[1][3] = put_pixels8_xy2_mmx;
02569
02570 c->put_no_rnd_pixels_tab[1][0] = put_pixels8_mmx;
02571 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx;
02572 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx;
02573 c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_mmx;
02574
02575 c->avg_pixels_tab[1][0] = avg_pixels8_mmx;
02576 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx;
02577 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx;
02578 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx;
02579
02580 c->avg_no_rnd_pixels_tab[1][0] = avg_no_rnd_pixels8_mmx;
02581 c->avg_no_rnd_pixels_tab[1][1] = avg_no_rnd_pixels8_x2_mmx;
02582 c->avg_no_rnd_pixels_tab[1][2] = avg_no_rnd_pixels8_y2_mmx;
02583 c->avg_no_rnd_pixels_tab[1][3] = avg_no_rnd_pixels8_xy2_mmx;
02584
02585 c->add_bytes= add_bytes_mmx;
02586 #ifdef CONFIG_ENCODERS
02587 c->diff_bytes= diff_bytes_mmx;
02588
02589 c->hadamard8_diff[0]= hadamard8_diff16_mmx;
02590 c->hadamard8_diff[1]= hadamard8_diff_mmx;
02591
02592 c->pix_norm1 = pix_norm1_mmx;
02593 c->sse[0] = sse16_mmx;
02594 c->sse[1] = sse8_mmx;
02595 c->vsad[4]= vsad_intra16_mmx;
02596
02597 c->nsse[0] = nsse16_mmx;
02598 c->nsse[1] = nsse8_mmx;
02599 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02600 c->vsad[0] = vsad16_mmx;
02601 }
02602
02603 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02604 c->try_8x8basis= try_8x8basis_mmx;
02605 }
02606 c->add_8x8basis= add_8x8basis_mmx;
02607
02608 #endif //CONFIG_ENCODERS
02609
02610 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
02611 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
02612 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx;
02613
02614 if (mm_flags & MM_MMXEXT) {
02615 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
02616 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
02617
02618 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
02619 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
02620 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
02621
02622 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
02623 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
02624
02625 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
02626 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
02627 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
02628
02629 #ifdef CONFIG_ENCODERS
02630 c->hadamard8_diff[0]= hadamard8_diff16_mmx2;
02631 c->hadamard8_diff[1]= hadamard8_diff_mmx2;
02632 c->vsad[4]= vsad_intra16_mmx2;
02633 #endif //CONFIG_ENCODERS
02634
02635 c->h264_idct_add= ff_h264_idct_add_mmx2;
02636
02637 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02638 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
02639 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
02640 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
02641 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
02642 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
02643 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
02644 #ifdef CONFIG_ENCODERS
02645 c->vsad[0] = vsad16_mmx2;
02646 #endif //CONFIG_ENCODERS
02647 }
02648
02649 #if 1
02650 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_mmx2)
02651 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_mmx2)
02652 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_mmx2)
02653 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_mmx2)
02654 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_mmx2)
02655 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_mmx2)
02656 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_mmx2)
02657 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_mmx2)
02658 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_mmx2)
02659 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_mmx2)
02660 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_mmx2)
02661 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_mmx2)
02662 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_mmx2)
02663 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_mmx2)
02664 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_mmx2)
02665 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_mmx2)
02666 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_mmx2)
02667 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_mmx2)
02668 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_mmx2)
02669 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_mmx2)
02670 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_mmx2)
02671 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_mmx2)
02672 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_mmx2)
02673 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_mmx2)
02674 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_mmx2)
02675 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_mmx2)
02676 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_mmx2)
02677 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_mmx2)
02678 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_mmx2)
02679 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_mmx2)
02680 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_mmx2)
02681 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_mmx2)
02682 #endif
02683
02684
02685 #define dspfunc(PFX, IDX, NUM) \
02686 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_mmx2; \
02687 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_mmx2; \
02688 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_mmx2; \
02689 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_mmx2; \
02690 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_mmx2; \
02691 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_mmx2; \
02692 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_mmx2; \
02693 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_mmx2; \
02694 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_mmx2; \
02695 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_mmx2; \
02696 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_mmx2; \
02697 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_mmx2; \
02698 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_mmx2; \
02699 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_mmx2; \
02700 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_mmx2; \
02701 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_mmx2
02702
02703 dspfunc(put_h264_qpel, 0, 16);
02704 dspfunc(put_h264_qpel, 1, 8);
02705 dspfunc(put_h264_qpel, 2, 4);
02706 dspfunc(avg_h264_qpel, 0, 16);
02707 dspfunc(avg_h264_qpel, 1, 8);
02708 dspfunc(avg_h264_qpel, 2, 4);
02709 #undef dspfunc
02710
02711 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2;
02712 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_mmx2;
02713 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_mmx2;
02714 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_mmx2;
02715 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_mmx2;
02716 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
02717 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
02718
02719 #ifdef CONFIG_ENCODERS
02720 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
02721 #endif //CONFIG_ENCODERS
02722 } else if (mm_flags & MM_3DNOW) {
02723 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
02724 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
02725
02726 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
02727 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
02728 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
02729
02730 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
02731 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
02732
02733 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
02734 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
02735 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
02736
02737 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
02738 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
02739 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
02740 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
02741 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
02742 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
02743 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
02744 }
02745
02746 SET_QPEL_FUNC(qpel_pixels_tab[0][ 0], qpel16_mc00_3dnow)
02747 SET_QPEL_FUNC(qpel_pixels_tab[0][ 1], qpel16_mc10_3dnow)
02748 SET_QPEL_FUNC(qpel_pixels_tab[0][ 2], qpel16_mc20_3dnow)
02749 SET_QPEL_FUNC(qpel_pixels_tab[0][ 3], qpel16_mc30_3dnow)
02750 SET_QPEL_FUNC(qpel_pixels_tab[0][ 4], qpel16_mc01_3dnow)
02751 SET_QPEL_FUNC(qpel_pixels_tab[0][ 5], qpel16_mc11_3dnow)
02752 SET_QPEL_FUNC(qpel_pixels_tab[0][ 6], qpel16_mc21_3dnow)
02753 SET_QPEL_FUNC(qpel_pixels_tab[0][ 7], qpel16_mc31_3dnow)
02754 SET_QPEL_FUNC(qpel_pixels_tab[0][ 8], qpel16_mc02_3dnow)
02755 SET_QPEL_FUNC(qpel_pixels_tab[0][ 9], qpel16_mc12_3dnow)
02756 SET_QPEL_FUNC(qpel_pixels_tab[0][10], qpel16_mc22_3dnow)
02757 SET_QPEL_FUNC(qpel_pixels_tab[0][11], qpel16_mc32_3dnow)
02758 SET_QPEL_FUNC(qpel_pixels_tab[0][12], qpel16_mc03_3dnow)
02759 SET_QPEL_FUNC(qpel_pixels_tab[0][13], qpel16_mc13_3dnow)
02760 SET_QPEL_FUNC(qpel_pixels_tab[0][14], qpel16_mc23_3dnow)
02761 SET_QPEL_FUNC(qpel_pixels_tab[0][15], qpel16_mc33_3dnow)
02762 SET_QPEL_FUNC(qpel_pixels_tab[1][ 0], qpel8_mc00_3dnow)
02763 SET_QPEL_FUNC(qpel_pixels_tab[1][ 1], qpel8_mc10_3dnow)
02764 SET_QPEL_FUNC(qpel_pixels_tab[1][ 2], qpel8_mc20_3dnow)
02765 SET_QPEL_FUNC(qpel_pixels_tab[1][ 3], qpel8_mc30_3dnow)
02766 SET_QPEL_FUNC(qpel_pixels_tab[1][ 4], qpel8_mc01_3dnow)
02767 SET_QPEL_FUNC(qpel_pixels_tab[1][ 5], qpel8_mc11_3dnow)
02768 SET_QPEL_FUNC(qpel_pixels_tab[1][ 6], qpel8_mc21_3dnow)
02769 SET_QPEL_FUNC(qpel_pixels_tab[1][ 7], qpel8_mc31_3dnow)
02770 SET_QPEL_FUNC(qpel_pixels_tab[1][ 8], qpel8_mc02_3dnow)
02771 SET_QPEL_FUNC(qpel_pixels_tab[1][ 9], qpel8_mc12_3dnow)
02772 SET_QPEL_FUNC(qpel_pixels_tab[1][10], qpel8_mc22_3dnow)
02773 SET_QPEL_FUNC(qpel_pixels_tab[1][11], qpel8_mc32_3dnow)
02774 SET_QPEL_FUNC(qpel_pixels_tab[1][12], qpel8_mc03_3dnow)
02775 SET_QPEL_FUNC(qpel_pixels_tab[1][13], qpel8_mc13_3dnow)
02776 SET_QPEL_FUNC(qpel_pixels_tab[1][14], qpel8_mc23_3dnow)
02777 SET_QPEL_FUNC(qpel_pixels_tab[1][15], qpel8_mc33_3dnow)
02778
02779 #define dspfunc(PFX, IDX, NUM) \
02780 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_3dnow; \
02781 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_3dnow; \
02782 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_3dnow; \
02783 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_3dnow; \
02784 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_3dnow; \
02785 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_3dnow; \
02786 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_3dnow; \
02787 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_3dnow; \
02788 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_3dnow; \
02789 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_3dnow; \
02790 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_3dnow; \
02791 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_3dnow; \
02792 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_3dnow; \
02793 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_3dnow; \
02794 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_3dnow; \
02795 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_3dnow
02796
02797 dspfunc(put_h264_qpel, 0, 16);
02798 dspfunc(put_h264_qpel, 1, 8);
02799 dspfunc(put_h264_qpel, 2, 4);
02800 dspfunc(avg_h264_qpel, 0, 16);
02801 dspfunc(avg_h264_qpel, 1, 8);
02802 dspfunc(avg_h264_qpel, 2, 4);
02803
02804 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow;
02805 }
02806 }
02807
02808 #ifdef CONFIG_ENCODERS
02809 dsputil_init_pix_mmx(c, avctx);
02810 #endif //CONFIG_ENCODERS
02811 #if 0
02812
02813 get_pixels = just_return;
02814 put_pixels_clamped = just_return;
02815 add_pixels_clamped = just_return;
02816
02817 pix_abs16x16 = just_return;
02818 pix_abs16x16_x2 = just_return;
02819 pix_abs16x16_y2 = just_return;
02820 pix_abs16x16_xy2 = just_return;
02821
02822 put_pixels_tab[0] = just_return;
02823 put_pixels_tab[1] = just_return;
02824 put_pixels_tab[2] = just_return;
02825 put_pixels_tab[3] = just_return;
02826
02827 put_no_rnd_pixels_tab[0] = just_return;
02828 put_no_rnd_pixels_tab[1] = just_return;
02829 put_no_rnd_pixels_tab[2] = just_return;
02830 put_no_rnd_pixels_tab[3] = just_return;
02831
02832 avg_pixels_tab[0] = just_return;
02833 avg_pixels_tab[1] = just_return;
02834 avg_pixels_tab[2] = just_return;
02835 avg_pixels_tab[3] = just_return;
02836
02837 avg_no_rnd_pixels_tab[0] = just_return;
02838 avg_no_rnd_pixels_tab[1] = just_return;
02839 avg_no_rnd_pixels_tab[2] = just_return;
02840 avg_no_rnd_pixels_tab[3] = just_return;
02841
02842
02843
02844 #endif
02845 }