00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
/*
 * SUMSUB_BA(a, b): asm fragment for a packed 16-bit butterfly.
 * With a0/b0 the incoming register contents, the fragment leaves
 *   a = a0 + b0
 *   b = b0 - a0      (formed as 2*b0 - (a0 + b0))
 * Both arguments are stringified into AT&T operands (source, destination).
 */
00024 #define SUMSUB_BA( a, b ) \
00025 "paddw "#b", "#a" \n\t"\
00026 "paddw "#b", "#b" \n\t"\
00027 "psubw "#a", "#b" \n\t"
00028
/*
 * SUMSUB_BADC(a, b, c, d): two interleaved 16-bit butterflies.
 * With a0..d0 the incoming values, the fragment leaves
 *   a = a0 + b0,  b = b0 - a0
 *   c = c0 + d0,  d = d0 - c0
 * The two butterflies are independent; their instructions are interleaved.
 */
00029 #define SUMSUB_BADC( a, b, c, d ) \
00030 "paddw "#b", "#a" \n\t"\
00031 "paddw "#d", "#c" \n\t"\
00032 "paddw "#b", "#b" \n\t"\
00033 "paddw "#d", "#d" \n\t"\
00034 "psubw "#a", "#b" \n\t"\
00035 "psubw "#c", "#d" \n\t"
00036
/*
 * SUMSUBD2_AB(a, b, t): butterfly with one operand halved on each side.
 * With a0/b0 the incoming values, the fragment leaves
 *   b = a0 + (b0 >> 1)
 *   a = (a0 >> 1) - b0
 * using arithmetic right shifts on packed 16-bit words.
 * t is scratch and ends up holding the original b0.
 */
00037 #define SUMSUBD2_AB( a, b, t ) \
00038 "movq "#b", "#t" \n\t"\
00039 "psraw $1 , "#b" \n\t"\
00040 "paddw "#a", "#b" \n\t"\
00041 "psraw $1 , "#a" \n\t"\
00042 "psubw "#t", "#a" \n\t"
00043
/*
 * IDCT4_1D(s02, s13, d02, d13, t): one 1-D pass of the 4-point transform,
 * assembled from the three butterfly macros:
 *   1. SUMSUB_BA   on (s02, d02)
 *   2. SUMSUBD2_AB on (s13, d13), with t as scratch
 *   3. SUMSUB_BADC combining the two intermediate pairs
 * All four data registers are overwritten in place; t is clobbered.
 */
00044 #define IDCT4_1D( s02, s13, d02, d13, t ) \
00045 SUMSUB_BA ( s02, d02 )\
00046 SUMSUBD2_AB( s13, d13, t )\
00047 SUMSUB_BADC( d13, s02, s13, d02 )
00048
/*
 * SBUTTERFLY(a, b, t, n): interleave registers a and b at granularity n,
 * where n is the punpck suffix (for example wd or dq).
 *   t = a                    (copy made before a is overwritten)
 *   a = punpckl<n>(a, b)     low halves interleaved
 *   t = punpckh<n>(t, b)     high halves interleaved
 * b is left unchanged. The trailing continuation means the following
 * (empty) line is still part of the macro.
 */
00049 #define SBUTTERFLY(a,b,t,n)\
00050 "movq " #a ", " #t " \n\t" \
00051 "punpckl" #n " " #b ", " #a " \n\t" \
00052 "punpckh" #n " " #b ", " #t " \n\t" \
00053
/*
 * TRANSPOSE4(a, b, c, d, t): transpose a 4x4 matrix of 16-bit words whose
 * rows are held in a, b, c, d.
 * Two word-level interleaves are followed by two dword-level interleaves.
 * Following the SBUTTERFLY destinations, the transposed rows end up in
 * a, d, t, c (in that order); b is used as scratch.
 */
00054 #define TRANSPOSE4(a,b,c,d,t)\
00055 SBUTTERFLY(a,b,t,wd) \
00056 SBUTTERFLY(c,d,b,wd) \
00057 SBUTTERFLY(a,c,d,dq) \
00058 SBUTTERFLY(t,b,c,dq)
00059
/*
 * STORE_DIFF_4P(p, t, z): add four 16-bit values to four pixels in memory.
 *   p >>= 6                        (arithmetic shift of each word)
 *   t  = 4 bytes loaded from (%0), interleaved with the bytes of z
 *        (zero-extended to words when z is all zero)
 *   p += t                         (signed saturating add)
 *   p  = packuswb(p, z)            (clamped to 0..255 per byte)
 *   low 4 bytes of p are stored back to (%0)
 * Operand %0 of the enclosing asm statement is the pixel address.
 * p and t are clobbered.
 */
00060 #define STORE_DIFF_4P( p, t, z ) \
00061 "psraw $6, "#p" \n\t"\
00062 "movd (%0), "#t" \n\t"\
00063 "punpcklbw "#z", "#t" \n\t"\
00064 "paddsw "#t", "#p" \n\t"\
00065 "packuswb "#z", "#p" \n\t"\
00066 "movd "#p", (%0) \n\t"
00067
/*
 * Transform a 4x4 block of 16-bit coefficients and add the result to a
 * 4x4 pixel area.
 *
 * dst    - top-left pixel of the destination area; rows are `stride`
 *          bytes apart and 4 bytes are read and written per row
 * block  - 16 coefficients (32 bytes), read as four quadwords
 * stride - byte distance between destination rows
 *
 * The work is split over three asm statements that pass data between each
 * other in mm0..mm7. No clobber list is declared on any of them, so the
 * code depends on nothing else touching the MMX registers in between.
 */
00068 void ff_h264_idct_add_mmx2(uint8_t *dst, int16_t *block, int stride)
00069 {
00070
/* Load the four 8-byte rows of the coefficient block into mm0..mm3. */
00071 asm volatile(
00072 "movq (%0), %%mm0 \n\t"
00073 "movq 8(%0), %%mm1 \n\t"
00074 "movq 16(%0), %%mm2 \n\t"
00075 "movq 24(%0), %%mm3 \n\t"
00076 :: "r"(block) );
00077
00078 asm volatile(
00079
/* First 1-D pass; mm4 is scratch. */
00080 IDCT4_1D( %%mm2, %%mm1, %%mm0, %%mm3, %%mm4 )
00081
/* mm6 = ff_pw_32 (operand %0 of this statement). */
00082 "movq %0, %%mm6 \n\t"
00083
/* Transpose; per TRANSPOSE4 the rows come out in mm3, mm2, mm4, mm0. */
00084 TRANSPOSE4( %%mm3, %%mm1, %%mm0, %%mm2, %%mm4 )
00085
/* Add ff_pw_32 to one input row before the second pass; together with */
/* the >>6 in STORE_DIFF_4P this looks like the rounding term -         */
/* TODO confirm that ff_pw_32 holds 32 in every word.                   */
00086 "paddw %%mm6, %%mm3 \n\t"
00087
00088
/* Second 1-D pass on the transposed data; mm1 is scratch. */
00089 IDCT4_1D( %%mm4, %%mm2, %%mm3, %%mm0, %%mm1 )
00090
/* mm7 = 0, used as the zero register by STORE_DIFF_4P below. */
00091 "pxor %%mm7, %%mm7 \n\t"
00092 :: "m"(ff_pw_32));
00093
/* Shift, add to the destination pixels and store, one row at a time;  */
/* dst (%0) is advanced by stride (%1) between rows.                   */
00094 asm volatile(
00095 STORE_DIFF_4P( %%mm0, %%mm1, %%mm7)
00096 "add %1, %0 \n\t"
00097 STORE_DIFF_4P( %%mm2, %%mm1, %%mm7)
00098 "add %1, %0 \n\t"
00099 STORE_DIFF_4P( %%mm3, %%mm1, %%mm7)
00100 "add %1, %0 \n\t"
00101 STORE_DIFF_4P( %%mm4, %%mm1, %%mm7)
00102 : "+r"(dst)
00103 : "r" ((long)stride)
00104 );
00105 }
00106
00107
00108
00109
00110
00111
00112
/*
 * DIFF_GT_MMX(x, y, a, o, t): per-byte test "|x - y| > a".
 *   t = sat(y - x), o = sat(x - y)   (unsigned saturating subtracts)
 *   o = o | t                         -> |x - y|
 *   o = sat(o - a)
 * After the fragment a byte of o is non-zero exactly where |x - y| > a.
 * x, y and a are only read; o receives the result; t is clobbered.
 */
00113 #define DIFF_GT_MMX(x,y,a,o,t)\
00114 "movq "#y", "#t" \n\t"\
00115 "movq "#x", "#o" \n\t"\
00116 "psubusb "#x", "#t" \n\t"\
00117 "psubusb "#y", "#o" \n\t"\
00118 "por "#t", "#o" \n\t"\
00119 "psubusb "#a", "#o" \n\t"
00120
00121
00122
00123
/*
 * H264_DEBLOCK_MASK(alpha1, beta1): build the per-pixel filter mask.
 *
 * in:  mm0, mm1, mm2, mm3 = the four pixel rows around the edge, as loaded
 *      by the callers in this file (two rows before the edge, then two
 *      rows after it, i.e. p1, p0, q0, q1)
 *      alpha1, beta1 = asm operands; the low 16-bit word of each is
 *      broadcast with pshufw and packed to unsigned bytes with saturation
 * out: mm7 = 0xFF in every byte where
 *        |mm1 - mm2| <= alpha1  &&  |mm0 - mm1| <= beta1  &&
 *        |mm3 - mm2| <= beta1
 *      and 0x00 elsewhere
 *      mm5 = beta1 replicated in every byte (callers reuse it)
 * clobbers: mm4, mm6 (mm6 is left zero)
 *
 * Callers pass alpha-1 and beta-1, which turns "<=" into "<" against the
 * original thresholds.
 */
00124 #define H264_DEBLOCK_MASK(alpha1, beta1) \
00125 "pshufw $0, "#alpha1", %%mm4 \n\t"\
00126 "pshufw $0, "#beta1 ", %%mm5 \n\t"\
00127 "packuswb %%mm4, %%mm4 \n\t"\
00128 "packuswb %%mm5, %%mm5 \n\t"\
00129 DIFF_GT_MMX(%%mm1, %%mm2, %%mm4, %%mm7, %%mm6) \
00130 DIFF_GT_MMX(%%mm0, %%mm1, %%mm5, %%mm4, %%mm6) \
00131 "por %%mm4, %%mm7 \n\t"\
00132 DIFF_GT_MMX(%%mm3, %%mm2, %%mm5, %%mm4, %%mm6) \
00133 "por %%mm4, %%mm7 \n\t"\
00134 "pxor %%mm6, %%mm6 \n\t"\
00135 "pcmpeqb %%mm6, %%mm7 \n\t"
00136
00137
00138
00139
/*
 * H264_DEBLOCK_P0_Q0(pb_01, pb_3f): filter the two pixel rows adjacent to
 * the edge.
 *
 * in:  mm0 = p1, mm1 = p0, mm2 = q0, mm3 = q1 (packed unsigned bytes)
 *      mm7 = per-byte clipping bound
 *      pb_01, pb_3f = asm operands; they are ANDed in to isolate bit 0 and
 *      to drop the bits that the 16-bit shifts move across byte boundaries
 *      (presumably 0x01 and 0x3F in every byte - confirm at the definition)
 * out: mm1 = filtered p0, mm2 = filtered q0
 * clobbers: mm0, mm3, mm4, mm5, mm6
 *
 * Every line of the definition ends in a continuation so that the whole
 * instruction sequence belongs to the macro; the intermediate comments are
 * inside the macro body and are removed by the preprocessor.
 */
#define H264_DEBLOCK_P0_Q0(pb_01, pb_3f)\
    /* a = q0^p0^((p1-q1)>>2); only bit 0 of each byte of a is used below */\
    "movq %%mm0, %%mm4 \n\t"\
    "psubb %%mm3, %%mm4 \n\t"\
    "psrlw $2, %%mm4 \n\t"\
    "pxor %%mm1, %%mm4 \n\t"\
    "pxor %%mm2, %%mm4 \n\t"\
    /* b = p0^(q1>>2) */\
    "psrlw $2, %%mm3 \n\t"\
    "pand "#pb_3f", %%mm3 \n\t"\
    "movq %%mm1, %%mm5 \n\t"\
    "pxor %%mm3, %%mm5 \n\t"\
    /* c = q0^(p1>>2) */\
    "psrlw $2, %%mm0 \n\t"\
    "pand "#pb_3f", %%mm0 \n\t"\
    "movq %%mm2, %%mm6 \n\t"\
    "pxor %%mm0, %%mm6 \n\t"\
    /* d = (c^b) & ~(b^a) & 1 */\
    "pxor %%mm5, %%mm6 \n\t"\
    "pxor %%mm4, %%mm5 \n\t"\
    "pandn %%mm6, %%mm5 \n\t"\
    "pand "#pb_01", %%mm5 \n\t"\
    /* delta = (avg(q0, p1>>2) + (d&a)) - (avg(p0, q1>>2) + (d&~a)) */\
    "pavgb %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "pand %%mm4, %%mm6 \n\t"\
    "paddusb %%mm6, %%mm0 \n\t"\
    "pavgb %%mm1, %%mm3 \n\t"\
    "pandn %%mm5, %%mm4 \n\t"\
    "paddusb %%mm4, %%mm3 \n\t"\
    /* p0 += clip(delta, -mm7, mm7); q0 -= clip(delta, -mm7, mm7).        */\
    /* The positive and negative parts of delta are kept in separate       */\
    /* unsigned registers (mm0, mm3) and each is limited with pminub.      */\
    "movq %%mm0, %%mm4 \n\t"\
    "psubusb %%mm3, %%mm0 \n\t"\
    "psubusb %%mm4, %%mm3 \n\t"\
    "pminub %%mm7, %%mm0 \n\t"\
    "pminub %%mm7, %%mm3 \n\t"\
    "paddusb %%mm0, %%mm1 \n\t"\
    "paddusb %%mm3, %%mm2 \n\t"\
    "psubusb %%mm3, %%mm1 \n\t"\
    "psubusb %%mm0, %%mm2 \n\t"
00182
00183
00184
00185
/*
 * H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp): filter the second
 * row away from the edge on one side.
 *
 * in:  mm1, mm2 = the two rows adjacent to the edge (p0, q0)
 *      p1     = register holding the row to be filtered (read only)
 *      q2     = register holding the next row further from the edge
 *      q2addr = string literal: memory operand of that same row
 *      q1addr = string literal: memory operand the result is stored to
 *      tc0    = register holding the per-byte clipping distance
 * out: (q1addr) = clip( (q2 + ((mm1 + mm2 + 1) >> 1)) >> 1,
 *                       p1 - tc0, p1 + tc0 )      with unsigned saturation
 * clobbers: q2, tmp, tc0
 *
 * The two pavgb instructions round up; the xor/and/psubusb sequence takes
 * the rounding of the second average back out. It ANDs with operand %8 of
 * the enclosing asm statement, which is hard-coded here (mm_bone in the
 * caller visible in this file).
 * q2addr and q1addr are pasted as string literals, not stringified.
 */
00186 #define H264_DEBLOCK_Q1(p1, q2, q2addr, q1addr, tc0, tmp)\
00187 "movq %%mm1, "#tmp" \n\t"\
00188 "pavgb %%mm2, "#tmp" \n\t"\
00189 "pavgb "#tmp", "#q2" \n\t" \
00190 "pxor "q2addr", "#tmp" \n\t"\
00191 "pand %8, "#tmp" \n\t" \
00192 "psubusb "#tmp", "#q2" \n\t" \
00193 "movq "#p1", "#tmp" \n\t"\
00194 "psubusb "#tc0", "#tmp" \n\t"\
00195 "paddusb "#p1", "#tc0" \n\t"\
00196 "pmaxub "#tmp", "#q2" \n\t"\
00197 "pminub "#tc0", "#q2" \n\t"\
00198 "movq "#q2", "q1addr" \n\t"
00199
/*
 * Filter one 8-pixel-wide luma edge lying between rows pix-stride and pix.
 *
 * pix    - first pixel of the row just after the edge; rows from
 *          pix-3*stride to pix+2*stride are read, rows from pix-2*stride
 *          to pix+stride may be written
 * stride - byte distance between rows
 * alpha1 - first threshold (callers pass alpha-1)
 * beta1  - second threshold (callers pass beta-1)
 * tc0    - two clipping values: tc0[0] for pixels 0..3, tc0[1] for 4..7;
 *          a negative value disables filtering for its four pixels
 *
 * The asm statement declares no clobbers although it overwrites
 * mm0..mm7 and stores to the pixel rows.
 */
00200 static inline void h264_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00201 {
00202 uint64_t tmp0;
/* Replicate tc0[0] into bytes 0-1 and tc0[1] into bytes 2-3.            */
/* The multiplier is unsigned: (uint8_t)tc0[1] can be up to 255 and      */
/* 255*0x01010000 does not fit in int, so a signed product would         */
/* overflow. Only the low 32 bits are consumed (pshufw $80 below).       */
00203 uint64_t tc = (uint8_t)tc0[1]*0x01010000U | (uint8_t)tc0[0]*0x0101;
00204
/* All-ones per 4-pixel half whose tc0 entry is non-negative. */
00205 uint32_t mask[2] = { (tc0[0]>=0)*0xffffffff, (tc0[1]>=0)*0xffffffff };
00206
00207 asm volatile(
/* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride. */
00208 "movq (%1,%3), %%mm0 \n\t"
00209 "movq (%1,%3,2), %%mm1 \n\t"
00210 "movq (%2), %%mm2 \n\t"
00211 "movq (%2,%3), %%mm3 \n\t"
00212 H264_DEBLOCK_MASK(%6, %7)
/* Drop the halves disabled by a negative tc0; keep the mask in tmp0. */
00213 "pand %5, %%mm7 \n\t"
00214 "movq %%mm7, %0 \n\t"
00215
00216
/* Side before the edge: mm3 = row pix-3*stride. mm6 becomes the mask of */
/* pixels that additionally satisfy |row(pix-3s) - row(pix-s)| <= beta1. */
00217 "movq (%1), %%mm3 \n\t"
00218 DIFF_GT_MMX(%%mm1, %%mm3, %%mm5, %%mm6, %%mm4)
00219 "pandn %%mm7, %%mm6 \n\t"
00220 "pcmpeqb %%mm7, %%mm6 \n\t"
00221 "pand %%mm7, %%mm6 \n\t"
/* mm4 = tc0 expanded to 4+4 bytes and masked; mm7 = that value plus     */
/* operand %8 (mm_bone) where the extra condition holds; mm6 = clipping  */
/* distance used for the row at pix-2*stride.                            */
00222 "pshufw $80, %4, %%mm4 \n\t"
00223 "pand %%mm7, %%mm4 \n\t"
00224 "movq %8, %%mm7 \n\t"
00225 "pand %%mm6, %%mm7 \n\t"
00226 "pand %%mm4, %%mm6 \n\t"
00227 "paddb %%mm4, %%mm7 \n\t"
/* Writes the filtered row to pix-2*stride; mm0 keeps the unfiltered row. */
00228 H264_DEBLOCK_Q1(%%mm0, %%mm3, "(%1)", "(%1,%3)", %%mm6, %%mm4)
00229
00230
/* Side after the edge: mm4 = row pix+2*stride; same test against the    */
/* row at pix, using the mask saved in tmp0 (%0).                        */
00231 "movq (%2,%3,2), %%mm4 \n\t"
00232 DIFF_GT_MMX(%%mm2, %%mm4, %%mm5, %%mm6, %%mm3)
00233 "pandn %0, %%mm6 \n\t"
00234 "pcmpeqb %0, %%mm6 \n\t"
00235 "pand %0, %%mm6 \n\t"
00236 "pshufw $80, %4, %%mm5 \n\t"
00237 "pand %%mm6, %%mm5 \n\t"
00238 "pand %8, %%mm6 \n\t"
00239 "paddb %%mm6, %%mm7 \n\t"
/* Reload row pix+stride (mm3 was used as scratch) and filter it. */
00240 "movq (%2,%3), %%mm3 \n\t"
00241 H264_DEBLOCK_Q1(%%mm3, %%mm4, "(%2,%3,2)", "(%2,%3)", %%mm5, %%mm6)
00242
00243
/* Filter the two rows adjacent to the edge with the bound accumulated   */
/* in mm7, then store them to pix-stride and pix.                        */
00244 H264_DEBLOCK_P0_Q0(%8, %9)
00245 "movq %%mm1, (%1,%3,2) \n\t"
00246 "movq %%mm2, (%2) \n\t"
00247
00248 : "=m"(tmp0)
00249 : "r"(pix-3*stride), "r"(pix), "r"((long)stride),
00250 "m"(tc), "m"(*(uint64_t*)mask), "m"(alpha1), "m"(beta1),
00251 "m"(mm_bone), "m"(ff_pb_3F)
00252 );
00253 }
00254
/*
 * Filter a 16-pixel-wide luma edge as two 8-pixel halves.
 * The edge lies between rows pix-stride and pix. tc0 holds four clipping
 * values, two per half. A half is skipped when both of its tc0 entries
 * are negative (the bitwise AND of the two is negative only then).
 * alpha and beta are passed on decremented by one.
 */
00255 static void h264_v_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00256 {
00257 if((tc0[0] & tc0[1]) >= 0)
00258 h264_loop_filter_luma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00259 if((tc0[2] & tc0[3]) >= 0)
00260 h264_loop_filter_luma_mmx2(pix+8, stride, alpha-1, beta-1, tc0+2);
00261 }
/*
 * Filter a luma edge running along a column (between pix[-1] and pix[0]),
 * 16 rows tall, by transposing into a scratch buffer and reusing the
 * row-oriented filter.
 *
 * The work is done in two passes of 8 rows; tc0 advances by two entries
 * per pass and a pass is skipped when both of its entries are negative.
 * transpose4x4 is defined outside this listing; from its use here the
 * arguments read as (dst, src, dst_stride, src_stride) - TODO confirm.
 */
00262 static void h264_h_loop_filter_luma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00263 {
00264
00265
/* 8x8 scratch area, 8 bytes per row. */
00266 uint8_t trans[8*8];
00267 int i;
00268 for(i=0; i<2; i++, pix+=8*stride, tc0+=2) {
00269 if((tc0[0] & tc0[1]) < 0)
00270 continue;
/* Gather columns pix-4 .. pix+3 of 8 source rows as four 4x4 tiles. */
00271 transpose4x4(trans, pix-4, 8, stride);
00272 transpose4x4(trans +4*8, pix, 8, stride);
00273 transpose4x4(trans+4, pix-4+4*stride, 8, stride);
00274 transpose4x4(trans+4+4*8, pix +4*stride, 8, stride);
/* In the scratch buffer the edge sits between rows 3 and 4. */
00275 h264_loop_filter_luma_mmx2(trans+4*8, 8, alpha-1, beta-1, tc0);
/* Copy back scratch rows 2..5 only, i.e. columns pix-2 .. pix+1. */
00276 transpose4x4(pix-2, trans +2*8, stride, 8);
00277 transpose4x4(pix-2+4*stride, trans+4+2*8, stride, 8);
00278 }
00279 }
00280
/*
 * Filter one 8-pixel-wide chroma edge lying between rows pix-stride and
 * pix. Only those two rows are written.
 *
 * pix    - first pixel of the row just after the edge
 * stride - byte distance between rows
 * alpha1 - first threshold (callers pass alpha-1)
 * beta1  - second threshold (callers pass beta-1)
 * tc0    - four clipping bytes, read as one 32-bit value; each byte is
 *          duplicated so that it covers two adjacent pixels
 *
 * No clobbers are declared although mm0..mm7 and memory are modified.
 */
00281 static inline void h264_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha1, int beta1, int8_t *tc0)
00282 {
00283 asm volatile(
/* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride. */
00284 "movq (%0), %%mm0 \n\t"
00285 "movq (%0,%2), %%mm1 \n\t"
00286 "movq (%1), %%mm2 \n\t"
00287 "movq (%1,%2), %%mm3 \n\t"
00288 H264_DEBLOCK_MASK(%4, %5)
/* mm6 = tc0 bytes, each repeated twice; mm7 = those bytes under the mask. */
00289 "movd %3, %%mm6 \n\t"
00290 "punpcklbw %%mm6, %%mm6 \n\t"
00291 "pand %%mm6, %%mm7 \n\t"
00292 H264_DEBLOCK_P0_Q0(%6, %7)
00293 "movq %%mm1, (%0,%2) \n\t"
00294 "movq %%mm2, (%1) \n\t"
00295
00296 :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
00297 "r"(*(uint32_t*)tc0),
00298 "m"(alpha1), "m"(beta1), "m"(mm_bone), "m"(ff_pb_3F)
00299 );
00300 }
00301
/*
 * Chroma edge between rows pix-stride and pix: forwards to the shared
 * chroma filter with alpha and beta each decremented by one.
 */
00302 static void h264_v_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00303 {
00304 h264_loop_filter_chroma_mmx2(pix, stride, alpha-1, beta-1, tc0);
00305 }
00306
/*
 * Chroma edge running along a column (between pix[-1] and pix[0]),
 * 8 rows tall. Columns pix-2 .. pix+1 are transposed into a 4-row scratch
 * buffer, filtered with the row-oriented routine and transposed back.
 * transpose4x4 is defined outside this listing; its arguments read as
 * (dst, src, dst_stride, src_stride) - TODO confirm.
 */
00307 static void h264_h_loop_filter_chroma_mmx2(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
00308 {
00309
/* 4 rows of 8 bytes. */
00310 uint8_t trans[8*4];
00311 transpose4x4(trans, pix-2, 8, stride);
00312 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
/* The edge sits between scratch rows 1 and 2. */
00313 h264_loop_filter_chroma_mmx2(trans+2*8, 8, alpha-1, beta-1, tc0);
00314 transpose4x4(pix-2, trans, stride, 8);
00315 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00316 }
00317
00318
/*
 * H264_FILTER_CHROMA4(p0, p1, q1, one): strong chroma filter for one row.
 *   mm4 = (p0 ^ q1) & one
 *   p0  = pavgb(p0, q1) - mm4      removes the round-up of the average
 *   p0  = pavgb(p0, p1)
 * When `one` holds 0x01 in every byte this evaluates, per byte, to
 *   p0 = (p0 + q1 + 2*p1 + 2) >> 2
 * p1 and q1 are only read; mm4 is clobbered. The trailing continuation
 * means the following (empty) line is still part of the macro.
 */
00319 #define H264_FILTER_CHROMA4(p0, p1, q1, one) \
00320 "movq "#p0", %%mm4 \n\t"\
00321 "pxor "#q1", %%mm4 \n\t"\
00322 "pand "#one", %%mm4 \n\t" \
00323 "pavgb "#q1", "#p0" \n\t"\
00324 "psubusb %%mm4, "#p0" \n\t"\
00325 "pavgb "#p1", "#p0" \n\t" \
00326
/*
 * Filter one 8-pixel-wide chroma edge between rows pix-stride and pix
 * without a clipping table (no tc0 argument).
 *
 * pix    - first pixel of the row just after the edge
 * stride - byte distance between rows
 * alpha1 - first threshold (callers pass alpha-1)
 * beta1  - second threshold (callers pass beta-1)
 *
 * Both rows next to the edge are filtered with H264_FILTER_CHROMA4 and the
 * result is kept only where the mask from H264_DEBLOCK_MASK is set.
 * No clobbers are declared although mm0..mm7 and memory are modified.
 */
00327 static inline void h264_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha1, int beta1)
00328 {
00329 asm volatile(
/* mm0..mm3 = rows pix-2*stride, pix-stride, pix, pix+stride. */
00330 "movq (%0), %%mm0 \n\t"
00331 "movq (%0,%2), %%mm1 \n\t"
00332 "movq (%1), %%mm2 \n\t"
00333 "movq (%1,%2), %%mm3 \n\t"
00334 H264_DEBLOCK_MASK(%3, %4)
/* Keep the unfiltered rows for the masked blend below. */
00335 "movq %%mm1, %%mm5 \n\t"
00336 "movq %%mm2, %%mm6 \n\t"
00337 H264_FILTER_CHROMA4(%%mm1, %%mm0, %%mm3, %5)
00338 H264_FILTER_CHROMA4(%%mm2, %%mm3, %%mm0, %5)
/* result = original + ((filtered - original) & mask), modulo 256. */
00339 "psubb %%mm5, %%mm1 \n\t"
00340 "psubb %%mm6, %%mm2 \n\t"
00341 "pand %%mm7, %%mm1 \n\t"
00342 "pand %%mm7, %%mm2 \n\t"
00343 "paddb %%mm5, %%mm1 \n\t"
00344 "paddb %%mm6, %%mm2 \n\t"
00345 "movq %%mm1, (%0,%2) \n\t"
00346 "movq %%mm2, (%1) \n\t"
00347 :: "r"(pix-2*stride), "r"(pix), "r"((long)stride),
00348 "m"(alpha1), "m"(beta1), "m"(mm_bone)
00349 );
00350 }
00351
/*
 * Chroma edge between rows pix-stride and pix, variant without tc0:
 * forwards to the shared routine with alpha and beta decremented by one.
 */
00352 static void h264_v_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00353 {
00354 h264_loop_filter_chroma_intra_mmx2(pix, stride, alpha-1, beta-1);
00355 }
00356
/*
 * Chroma edge running along a column (between pix[-1] and pix[0]),
 * 8 rows tall, variant without tc0. Columns pix-2 .. pix+1 are transposed
 * into a 4-row scratch buffer, filtered, and transposed back.
 * transpose4x4 is defined outside this listing; its arguments read as
 * (dst, src, dst_stride, src_stride) - TODO confirm.
 */
00357 static void h264_h_loop_filter_chroma_intra_mmx2(uint8_t *pix, int stride, int alpha, int beta)
00358 {
00359
/* 4 rows of 8 bytes. */
00360 uint8_t trans[8*4];
00361 transpose4x4(trans, pix-2, 8, stride);
00362 transpose4x4(trans+4, pix-2+4*stride, 8, stride);
/* The edge sits between scratch rows 1 and 2. */
00363 h264_loop_filter_chroma_intra_mmx2(trans+2*8, 8, alpha-1, beta-1);
00364 transpose4x4(pix-2, trans, stride, 8);
00365 transpose4x4(pix-2+4*stride, trans+4, stride, 8);
00366 }
00367
00368
00369
00370
00371
/*
 * QPEL_H264V(A, B, C, D, E, F, OP): one output row of the vertical 6-tap
 * filter on four pixels.
 *
 * A..E hold five consecutive source rows as 16-bit words. The sixth row
 * is loaded from (%0) into F and widened against mm7 (expected zero).
 * Per pixel the fragment computes
 *   (A + F + %4 * (4*(C + D) - B - E) + %5) >> 5
 * packs the result to unsigned bytes and hands it to OP for (%1).
 * In the callers in this file %4 is ff_pw_5 and %5 is ff_pw_16.
 *
 * Side effects: %0 advances by %2 (source stride), %1 advances by %3
 * (destination stride); mm6 and A are clobbered (A is also given to OP as
 * its temporary). Rotating the register arguments by one between
 * invocations slides the filter window down one row.
 */
00372 #define QPEL_H264V(A,B,C,D,E,F,OP)\
00373 "movd (%0), "#F" \n\t"\
00374 "movq "#C", %%mm6 \n\t"\
00375 "paddw "#D", %%mm6 \n\t"\
00376 "psllw $2, %%mm6 \n\t"\
00377 "psubw "#B", %%mm6 \n\t"\
00378 "psubw "#E", %%mm6 \n\t"\
00379 "pmullw %4, %%mm6 \n\t"\
00380 "add %2, %0 \n\t"\
00381 "punpcklbw %%mm7, "#F" \n\t"\
00382 "paddw %5, "#A" \n\t"\
00383 "paddw "#F", "#A" \n\t"\
00384 "paddw "#A", %%mm6 \n\t"\
00385 "psraw $5, %%mm6 \n\t"\
00386 "packuswb %%mm6, %%mm6 \n\t"\
00387 OP(%%mm6, (%1), A, d)\
00388 "add %3, %1 \n\t"
00389
/*
 * QPEL_H264HV(A, B, C, D, E, F, OF): vertical 6-tap filter step that keeps
 * full 16-bit precision, used as the first stage of the 2-D filter.
 *
 * Same register roles as QPEL_H264V, but no rounding constant is added
 * and nothing is shifted or packed. The four 16-bit sums
 *   A + F + %3 * (4*(C + D) - B - E)
 * are stored as one quadword at byte offset OF from %1.
 * In the callers in this file %3 is ff_pw_5.
 *
 * Side effects: %0 advances by %2 (source stride); %1 is not modified;
 * mm6 and A are clobbered.
 */
00390 #define QPEL_H264HV(A,B,C,D,E,F,OF)\
00391 "movd (%0), "#F" \n\t"\
00392 "movq "#C", %%mm6 \n\t"\
00393 "paddw "#D", %%mm6 \n\t"\
00394 "psllw $2, %%mm6 \n\t"\
00395 "psubw "#B", %%mm6 \n\t"\
00396 "psubw "#E", %%mm6 \n\t"\
00397 "pmullw %3, %%mm6 \n\t"\
00398 "add %2, %0 \n\t"\
00399 "punpcklbw %%mm7, "#F" \n\t"\
00400 "paddw "#F", "#A" \n\t"\
00401 "paddw "#A", %%mm6 \n\t"\
00402 "movq %%mm6, "#OF"(%1) \n\t"
00403
/*
 * QPEL_H264(OPNAME, OP, MMX): generates the half-sample interpolation
 * filters for 4x4, 8x8 and 16x16 blocks.
 *
 *   OPNAME - name prefix of the generated functions (put_ or avg_ below)
 *   OP     - macro that writes a result register to the destination
 *            (plain store or average with the destination)
 *   MMX    - name suffix (mmx2 or 3dnow below)
 *
 * Generated functions (names are OPNAME h264_qpelN_<kind>_lowpass_ MMX):
 *
 *   qpel4_h  - 4 rows; per pixel
 *                (s[-2] + s[3] + 5*(4*(s[0] + s[1]) - (s[-1] + s[2])) + 16) >> 5
 *              using ff_pw_5 and ff_pw_16, packed with unsigned saturation.
 *   qpel4_v  - starts two rows above src, preloads five rows and emits
 *              four QPEL_H264V steps.
 *   qpel4_hv - stage 1: three column groups of four pixels, starting two
 *              rows above and two columns left of src, are filtered
 *              vertically with QPEL_H264HV into tmp (24 bytes per row).
 *              Stage 2: for each of the 4 rows, with a = t[0]+t[5],
 *              b = t[1]+t[4], c = t[2]+t[3] taken from tmp, computes
 *                ((((a - b) >> 2) - b + c) >> 2) + c + 32) >> 6
 *              and writes it through OP.
 *   qpel8_h  - same filter as qpel4_h on 8 pixels per row, 8 rows.
 *   qpel8_v  - two passes of four columns, eight QPEL_H264V steps each.
 *   qpel8_hv - as qpel4_hv with four column groups, 8 rows and 32 bytes
 *              per tmp row.
 *   qpel16_v, qpel16_h, qpel16_hv - call the 8x8 version on the four
 *              quadrants; the hv variant reuses the same tmp for each.
 *
 * The tmpStride parameter of the hv functions is not referenced by the
 * code. Loop counters are kept in memory ("+m"(h)) and decremented with
 * decl. The trailing continuation after the last function means the
 * following (empty) line is still part of the macro.
 */
00404 #define QPEL_H264(OPNAME, OP, MMX)\
00405 static void OPNAME ## h264_qpel4_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00406 int h=4;\
00407 \
00408 asm volatile(\
00409 "pxor %%mm7, %%mm7 \n\t"\
00410 "movq %5, %%mm4 \n\t"\
00411 "movq %6, %%mm5 \n\t"\
00412 "1: \n\t"\
00413 "movd -1(%0), %%mm1 \n\t"\
00414 "movd (%0), %%mm2 \n\t"\
00415 "movd 1(%0), %%mm3 \n\t"\
00416 "movd 2(%0), %%mm0 \n\t"\
00417 "punpcklbw %%mm7, %%mm1 \n\t"\
00418 "punpcklbw %%mm7, %%mm2 \n\t"\
00419 "punpcklbw %%mm7, %%mm3 \n\t"\
00420 "punpcklbw %%mm7, %%mm0 \n\t"\
00421 "paddw %%mm0, %%mm1 \n\t"\
00422 "paddw %%mm3, %%mm2 \n\t"\
00423 "movd -2(%0), %%mm0 \n\t"\
00424 "movd 3(%0), %%mm3 \n\t"\
00425 "punpcklbw %%mm7, %%mm0 \n\t"\
00426 "punpcklbw %%mm7, %%mm3 \n\t"\
00427 "paddw %%mm3, %%mm0 \n\t"\
00428 "psllw $2, %%mm2 \n\t"\
00429 "psubw %%mm1, %%mm2 \n\t"\
00430 "pmullw %%mm4, %%mm2 \n\t"\
00431 "paddw %%mm5, %%mm0 \n\t"\
00432 "paddw %%mm2, %%mm0 \n\t"\
00433 "psraw $5, %%mm0 \n\t"\
00434 "packuswb %%mm0, %%mm0 \n\t"\
00435 OP(%%mm0, (%1),%%mm6, d)\
00436 "add %3, %0 \n\t"\
00437 "add %4, %1 \n\t"\
00438 "decl %2 \n\t"\
00439 " jnz 1b \n\t"\
00440 : "+a"(src), "+c"(dst), "+m"(h)\
00441 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00442 : "memory"\
00443 );\
00444 }\
00445 static void OPNAME ## h264_qpel4_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00446 src -= 2*srcStride;\
00447 asm volatile(\
00448 "pxor %%mm7, %%mm7 \n\t"\
00449 "movd (%0), %%mm0 \n\t"\
00450 "add %2, %0 \n\t"\
00451 "movd (%0), %%mm1 \n\t"\
00452 "add %2, %0 \n\t"\
00453 "movd (%0), %%mm2 \n\t"\
00454 "add %2, %0 \n\t"\
00455 "movd (%0), %%mm3 \n\t"\
00456 "add %2, %0 \n\t"\
00457 "movd (%0), %%mm4 \n\t"\
00458 "add %2, %0 \n\t"\
00459 "punpcklbw %%mm7, %%mm0 \n\t"\
00460 "punpcklbw %%mm7, %%mm1 \n\t"\
00461 "punpcklbw %%mm7, %%mm2 \n\t"\
00462 "punpcklbw %%mm7, %%mm3 \n\t"\
00463 "punpcklbw %%mm7, %%mm4 \n\t"\
00464 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
00465 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
00466 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
00467 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
00468 \
00469 : "+a"(src), "+c"(dst)\
00470 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00471 : "memory"\
00472 );\
00473 }\
00474 static void OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
00475 int h=4;\
00476 int w=3;\
00477 src -= 2*srcStride+2;\
00478 while(w--){\
00479 asm volatile(\
00480 "pxor %%mm7, %%mm7 \n\t"\
00481 "movd (%0), %%mm0 \n\t"\
00482 "add %2, %0 \n\t"\
00483 "movd (%0), %%mm1 \n\t"\
00484 "add %2, %0 \n\t"\
00485 "movd (%0), %%mm2 \n\t"\
00486 "add %2, %0 \n\t"\
00487 "movd (%0), %%mm3 \n\t"\
00488 "add %2, %0 \n\t"\
00489 "movd (%0), %%mm4 \n\t"\
00490 "add %2, %0 \n\t"\
00491 "punpcklbw %%mm7, %%mm0 \n\t"\
00492 "punpcklbw %%mm7, %%mm1 \n\t"\
00493 "punpcklbw %%mm7, %%mm2 \n\t"\
00494 "punpcklbw %%mm7, %%mm3 \n\t"\
00495 "punpcklbw %%mm7, %%mm4 \n\t"\
00496 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*3)\
00497 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*3)\
00498 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*3)\
00499 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*3)\
00500 \
00501 : "+a"(src)\
00502 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
00503 : "memory"\
00504 );\
00505 tmp += 4;\
00506 src += 4 - 9*srcStride;\
00507 }\
00508 tmp -= 3*4;\
00509 asm volatile(\
00510 "movq %4, %%mm6 \n\t"\
00511 "1: \n\t"\
00512 "movq (%0), %%mm0 \n\t"\
00513 "paddw 10(%0), %%mm0 \n\t"\
00514 "movq 2(%0), %%mm1 \n\t"\
00515 "paddw 8(%0), %%mm1 \n\t"\
00516 "movq 4(%0), %%mm2 \n\t"\
00517 "paddw 6(%0), %%mm2 \n\t"\
00518 "psubw %%mm1, %%mm0 \n\t"\
00519 "psraw $2, %%mm0 \n\t"\
00520 "psubw %%mm1, %%mm0 \n\t"\
00521 "paddsw %%mm2, %%mm0 \n\t"\
00522 "psraw $2, %%mm0 \n\t"\
00523 "paddw %%mm6, %%mm2 \n\t"\
00524 "paddw %%mm2, %%mm0 \n\t"\
00525 "psraw $6, %%mm0 \n\t"\
00526 "packuswb %%mm0, %%mm0 \n\t"\
00527 OP(%%mm0, (%1),%%mm7, d)\
00528 "add $24, %0 \n\t"\
00529 "add %3, %1 \n\t"\
00530 "decl %2 \n\t"\
00531 " jnz 1b \n\t"\
00532 : "+a"(tmp), "+c"(dst), "+m"(h)\
00533 : "S"((long)dstStride), "m"(ff_pw_32)\
00534 : "memory"\
00535 );\
00536 }\
00537 \
00538 static void OPNAME ## h264_qpel8_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00539 int h=8;\
00540 asm volatile(\
00541 "pxor %%mm7, %%mm7 \n\t"\
00542 "movq %5, %%mm6 \n\t"\
00543 "1: \n\t"\
00544 "movq (%0), %%mm0 \n\t"\
00545 "movq 1(%0), %%mm2 \n\t"\
00546 "movq %%mm0, %%mm1 \n\t"\
00547 "movq %%mm2, %%mm3 \n\t"\
00548 "punpcklbw %%mm7, %%mm0 \n\t"\
00549 "punpckhbw %%mm7, %%mm1 \n\t"\
00550 "punpcklbw %%mm7, %%mm2 \n\t"\
00551 "punpckhbw %%mm7, %%mm3 \n\t"\
00552 "paddw %%mm2, %%mm0 \n\t"\
00553 "paddw %%mm3, %%mm1 \n\t"\
00554 "psllw $2, %%mm0 \n\t"\
00555 "psllw $2, %%mm1 \n\t"\
00556 "movq -1(%0), %%mm2 \n\t"\
00557 "movq 2(%0), %%mm4 \n\t"\
00558 "movq %%mm2, %%mm3 \n\t"\
00559 "movq %%mm4, %%mm5 \n\t"\
00560 "punpcklbw %%mm7, %%mm2 \n\t"\
00561 "punpckhbw %%mm7, %%mm3 \n\t"\
00562 "punpcklbw %%mm7, %%mm4 \n\t"\
00563 "punpckhbw %%mm7, %%mm5 \n\t"\
00564 "paddw %%mm4, %%mm2 \n\t"\
00565 "paddw %%mm3, %%mm5 \n\t"\
00566 "psubw %%mm2, %%mm0 \n\t"\
00567 "psubw %%mm5, %%mm1 \n\t"\
00568 "pmullw %%mm6, %%mm0 \n\t"\
00569 "pmullw %%mm6, %%mm1 \n\t"\
00570 "movd -2(%0), %%mm2 \n\t"\
00571 "movd 7(%0), %%mm5 \n\t"\
00572 "punpcklbw %%mm7, %%mm2 \n\t"\
00573 "punpcklbw %%mm7, %%mm5 \n\t"\
00574 "paddw %%mm3, %%mm2 \n\t"\
00575 "paddw %%mm5, %%mm4 \n\t"\
00576 "movq %6, %%mm5 \n\t"\
00577 "paddw %%mm5, %%mm2 \n\t"\
00578 "paddw %%mm5, %%mm4 \n\t"\
00579 "paddw %%mm2, %%mm0 \n\t"\
00580 "paddw %%mm4, %%mm1 \n\t"\
00581 "psraw $5, %%mm0 \n\t"\
00582 "psraw $5, %%mm1 \n\t"\
00583 "packuswb %%mm1, %%mm0 \n\t"\
00584 OP(%%mm0, (%1),%%mm5, q)\
00585 "add %3, %0 \n\t"\
00586 "add %4, %1 \n\t"\
00587 "decl %2 \n\t"\
00588 " jnz 1b \n\t"\
00589 : "+a"(src), "+c"(dst), "+m"(h)\
00590 : "d"((long)srcStride), "S"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00591 : "memory"\
00592 );\
00593 }\
00594 \
00595 static void OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00596 int h= 2;\
00597 src -= 2*srcStride;\
00598 \
00599 while(h--){\
00600 asm volatile(\
00601 "pxor %%mm7, %%mm7 \n\t"\
00602 "movd (%0), %%mm0 \n\t"\
00603 "add %2, %0 \n\t"\
00604 "movd (%0), %%mm1 \n\t"\
00605 "add %2, %0 \n\t"\
00606 "movd (%0), %%mm2 \n\t"\
00607 "add %2, %0 \n\t"\
00608 "movd (%0), %%mm3 \n\t"\
00609 "add %2, %0 \n\t"\
00610 "movd (%0), %%mm4 \n\t"\
00611 "add %2, %0 \n\t"\
00612 "punpcklbw %%mm7, %%mm0 \n\t"\
00613 "punpcklbw %%mm7, %%mm1 \n\t"\
00614 "punpcklbw %%mm7, %%mm2 \n\t"\
00615 "punpcklbw %%mm7, %%mm3 \n\t"\
00616 "punpcklbw %%mm7, %%mm4 \n\t"\
00617 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
00618 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
00619 QPEL_H264V(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP)\
00620 QPEL_H264V(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP)\
00621 QPEL_H264V(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP)\
00622 QPEL_H264V(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP)\
00623 QPEL_H264V(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP)\
00624 QPEL_H264V(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP)\
00625 \
00626 : "+a"(src), "+c"(dst)\
00627 : "S"((long)srcStride), "D"((long)dstStride), "m"(ff_pw_5), "m"(ff_pw_16)\
00628 : "memory"\
00629 );\
00630 src += 4-13*srcStride;\
00631 dst += 4-8*dstStride;\
00632 }\
00633 }\
00634 static void OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
00635 int h=8;\
00636 int w=4;\
00637 src -= 2*srcStride+2;\
00638 while(w--){\
00639 asm volatile(\
00640 "pxor %%mm7, %%mm7 \n\t"\
00641 "movd (%0), %%mm0 \n\t"\
00642 "add %2, %0 \n\t"\
00643 "movd (%0), %%mm1 \n\t"\
00644 "add %2, %0 \n\t"\
00645 "movd (%0), %%mm2 \n\t"\
00646 "add %2, %0 \n\t"\
00647 "movd (%0), %%mm3 \n\t"\
00648 "add %2, %0 \n\t"\
00649 "movd (%0), %%mm4 \n\t"\
00650 "add %2, %0 \n\t"\
00651 "punpcklbw %%mm7, %%mm0 \n\t"\
00652 "punpcklbw %%mm7, %%mm1 \n\t"\
00653 "punpcklbw %%mm7, %%mm2 \n\t"\
00654 "punpcklbw %%mm7, %%mm3 \n\t"\
00655 "punpcklbw %%mm7, %%mm4 \n\t"\
00656 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 0*8*4)\
00657 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 1*8*4)\
00658 QPEL_H264HV(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, 2*8*4)\
00659 QPEL_H264HV(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, 3*8*4)\
00660 QPEL_H264HV(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, 4*8*4)\
00661 QPEL_H264HV(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, 5*8*4)\
00662 QPEL_H264HV(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, 6*8*4)\
00663 QPEL_H264HV(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, 7*8*4)\
00664 \
00665 : "+a"(src)\
00666 : "c"(tmp), "S"((long)srcStride), "m"(ff_pw_5)\
00667 : "memory"\
00668 );\
00669 tmp += 4;\
00670 src += 4 - 13*srcStride;\
00671 }\
00672 tmp -= 4*4;\
00673 asm volatile(\
00674 "movq %4, %%mm6 \n\t"\
00675 "1: \n\t"\
00676 "movq (%0), %%mm0 \n\t"\
00677 "movq 8(%0), %%mm3 \n\t"\
00678 "movq 2(%0), %%mm1 \n\t"\
00679 "movq 10(%0), %%mm4 \n\t"\
00680 "paddw %%mm4, %%mm0 \n\t"\
00681 "paddw %%mm3, %%mm1 \n\t"\
00682 "paddw 18(%0), %%mm3 \n\t"\
00683 "paddw 16(%0), %%mm4 \n\t"\
00684 "movq 4(%0), %%mm2 \n\t"\
00685 "movq 12(%0), %%mm5 \n\t"\
00686 "paddw 6(%0), %%mm2 \n\t"\
00687 "paddw 14(%0), %%mm5 \n\t"\
00688 "psubw %%mm1, %%mm0 \n\t"\
00689 "psubw %%mm4, %%mm3 \n\t"\
00690 "psraw $2, %%mm0 \n\t"\
00691 "psraw $2, %%mm3 \n\t"\
00692 "psubw %%mm1, %%mm0 \n\t"\
00693 "psubw %%mm4, %%mm3 \n\t"\
00694 "paddsw %%mm2, %%mm0 \n\t"\
00695 "paddsw %%mm5, %%mm3 \n\t"\
00696 "psraw $2, %%mm0 \n\t"\
00697 "psraw $2, %%mm3 \n\t"\
00698 "paddw %%mm6, %%mm2 \n\t"\
00699 "paddw %%mm6, %%mm5 \n\t"\
00700 "paddw %%mm2, %%mm0 \n\t"\
00701 "paddw %%mm5, %%mm3 \n\t"\
00702 "psraw $6, %%mm0 \n\t"\
00703 "psraw $6, %%mm3 \n\t"\
00704 "packuswb %%mm3, %%mm0 \n\t"\
00705 OP(%%mm0, (%1),%%mm7, q)\
00706 "add $32, %0 \n\t"\
00707 "add %3, %1 \n\t"\
00708 "decl %2 \n\t"\
00709 " jnz 1b \n\t"\
00710 : "+a"(tmp), "+c"(dst), "+m"(h)\
00711 : "S"((long)dstStride), "m"(ff_pw_32)\
00712 : "memory"\
00713 );\
00714 }\
00715 static void OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00716 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
00717 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
00718 src += 8*srcStride;\
00719 dst += 8*dstStride;\
00720 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
00721 OPNAME ## h264_qpel8_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
00722 }\
00723 \
00724 static void OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
00725 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
00726 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
00727 src += 8*srcStride;\
00728 dst += 8*dstStride;\
00729 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
00730 OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
00731 }\
00732 \
00733 static void OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
00734 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
00735 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
00736 src += 8*srcStride;\
00737 dst += 8*dstStride;\
00738 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride);\
00739 OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(dst+8, tmp , src+8, dstStride, tmpStride, srcStride);\
00740 }\
00741
/*
 * H264_MC(OPNAME, SIZE, MMX): generates the sixteen motion-compensation
 * entry points OPNAME h264_qpel SIZE _mcXY_ MMX (dst, src, stride) for one
 * block size. From the sources each function uses, X appears to select
 * the horizontal and Y the vertical quarter-sample position.
 *
 *   mc00           copy via OPNAME pixels SIZE _mmx
 *   mc20 mc02 mc22 horizontal, vertical and 2-D lowpass written straight
 *                  to dst with OPNAME applied
 *   mc10 mc30      put_ horizontal lowpass into a stack buffer, then
 *                  OPNAME pixels SIZE _l2_ MMX combines it with src
 *                  (mc10) or src+1 (mc30)
 *   mc01 mc03      the same with the vertical lowpass and src (mc01) or
 *                  src+stride (mc03)
 *   mc11 mc31      horizontal lowpass (of src, or of src+stride for
 *   mc13 mc33      mc13/mc33) combined with vertical lowpass (of src, or
 *                  of src+1 for mc31/mc33)
 *   mc21 mc23      horizontal lowpass (of src / src+stride) combined with
 *                  the 2-D lowpass
 *   mc12 mc32      vertical lowpass (of src / src+1) combined with the
 *                  2-D lowpass
 *
 * The _l2_ helpers are defined outside this listing; presumably they
 * average their two source arguments - TODO confirm.
 *
 * Scratch buffers are uint64_t arrays on the stack. In the variants that
 * use the 2-D lowpass, the first SIZE*SIZE bytes hold the 1-D result, the
 * next SIZE*SIZE bytes the 2-D result, and the 16-bit intermediate area
 * starts at byte offset 2*SIZE*SIZE. The trailing continuation after the
 * last function means the following (empty) line is still part of the
 * macro.
 */
00742 #define H264_MC(OPNAME, SIZE, MMX) \
00743 static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
00744 OPNAME ## pixels ## SIZE ## _mmx(dst, src, stride, SIZE);\
00745 }\
00746 \
00747 static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00748 uint64_t temp[SIZE*SIZE/8];\
00749 uint8_t * const half= (uint8_t*)temp;\
00750 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
00751 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
00752 }\
00753 \
00754 static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00755 OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
00756 }\
00757 \
00758 static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00759 uint64_t temp[SIZE*SIZE/8];\
00760 uint8_t * const half= (uint8_t*)temp;\
00761 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(half, src, SIZE, stride);\
00762 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+1, half, stride, stride, SIZE);\
00763 }\
00764 \
00765 static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00766 uint64_t temp[SIZE*SIZE/8];\
00767 uint8_t * const half= (uint8_t*)temp;\
00768 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
00769 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, half, stride, stride, SIZE);\
00770 }\
00771 \
00772 static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00773 OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
00774 }\
00775 \
00776 static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00777 uint64_t temp[SIZE*SIZE/8];\
00778 uint8_t * const half= (uint8_t*)temp;\
00779 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(half, src, SIZE, stride);\
00780 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, half, stride, stride, SIZE);\
00781 }\
00782 \
00783 static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00784 uint64_t temp[SIZE*SIZE/4];\
00785 uint8_t * const halfH= (uint8_t*)temp;\
00786 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
00787 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
00788 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
00789 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
00790 }\
00791 \
00792 static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00793 uint64_t temp[SIZE*SIZE/4];\
00794 uint8_t * const halfH= (uint8_t*)temp;\
00795 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
00796 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
00797 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
00798 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
00799 }\
00800 \
00801 static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00802 uint64_t temp[SIZE*SIZE/4];\
00803 uint8_t * const halfH= (uint8_t*)temp;\
00804 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
00805 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
00806 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
00807 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
00808 }\
00809 \
00810 static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00811 uint64_t temp[SIZE*SIZE/4];\
00812 uint8_t * const halfH= (uint8_t*)temp;\
00813 uint8_t * const halfV= ((uint8_t*)temp) + SIZE*SIZE;\
00814 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
00815 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
00816 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfV, stride, SIZE, SIZE);\
00817 }\
00818 \
00819 static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00820 uint64_t temp[SIZE*(SIZE+8)/4];\
00821 int16_t * const tmp= (int16_t*)temp;\
00822 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, tmp, src, stride, SIZE, stride);\
00823 }\
00824 \
00825 static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00826 uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
00827 uint8_t * const halfH= (uint8_t*)temp;\
00828 uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
00829 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
00830 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src, SIZE, stride);\
00831 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
00832 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
00833 }\
00834 \
00835 static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00836 uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
00837 uint8_t * const halfH= (uint8_t*)temp;\
00838 uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
00839 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
00840 put_h264_qpel ## SIZE ## _h_lowpass_ ## MMX(halfH, src + stride, SIZE, stride);\
00841 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
00842 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfH, halfHV, stride, SIZE, SIZE);\
00843 }\
00844 \
00845 static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00846 uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
00847 uint8_t * const halfV= (uint8_t*)temp;\
00848 uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
00849 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
00850 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src, SIZE, stride);\
00851 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
00852 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
00853 }\
00854 \
00855 static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
00856 uint64_t temp[SIZE*(SIZE+8)/4 + SIZE*SIZE/4];\
00857 uint8_t * const halfV= (uint8_t*)temp;\
00858 uint8_t * const halfHV= ((uint8_t*)temp) + SIZE*SIZE;\
00859 int16_t * const tmp= ((int16_t*)temp) + SIZE*SIZE;\
00860 put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(halfV, src+1, SIZE, stride);\
00861 put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, tmp, src, SIZE, SIZE, stride);\
00862 OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, halfV, halfHV, stride, SIZE, SIZE);\
00863 }\
00864
00865
/*
 * PUT_OP(a, b, temp, size): plain store. Emits "mov<size> a, b", where
 * size is the mov suffix (d or q). temp is accepted for signature
 * compatibility with the averaging variants and is not used.
 */
00866 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_3DNOW_OP(a, b, temp, size): averaging store using the 3DNow!
 * pavgusb instruction. Loads the destination b into temp, averages it
 * into a, then stores a back to b. a and temp are clobbered.
 */
00867 #define AVG_3DNOW_OP(a,b,temp, size) \
00868 "mov" #size " " #b ", " #temp " \n\t"\
00869 "pavgusb " #temp ", " #a " \n\t"\
00870 "mov" #size " " #a ", " #b " \n\t"
/*
 * AVG_MMX2_OP(a, b, temp, size): averaging store using pavgb.
 * Loads the destination b into temp, averages it into a, then stores a
 * back to b. a and temp are clobbered.
 */
00871 #define AVG_MMX2_OP(a,b,temp, size) \
00872 "mov" #size " " #b ", " #temp " \n\t"\
00873 "pavgb " #temp ", " #a " \n\t"\
00874 "mov" #size " " #a ", " #b " \n\t"
00875
/*
 * Instantiate the lowpass filters for both store flavours (put_, avg_)
 * and both instruction-set suffixes. The avg_ flavour uses the averaging
 * instruction that matches the suffix (pavgusb for 3dnow, pavgb for mmx2).
 */
00876 QPEL_H264(put_, PUT_OP, 3dnow)
00877 QPEL_H264(avg_, AVG_3DNOW_OP, 3dnow)
00878 QPEL_H264(put_, PUT_OP, mmx2)
00879 QPEL_H264(avg_, AVG_MMX2_OP, mmx2)
00880
/*
 * Instantiate the sixteen mcXY entry points for every combination of
 * store flavour, block size (4, 8, 16) and instruction-set suffix.
 */
00881 H264_MC(put_, 4, 3dnow)
00882 H264_MC(put_, 8, 3dnow)
00883 H264_MC(put_, 16,3dnow)
00884 H264_MC(avg_, 4, 3dnow)
00885 H264_MC(avg_, 8, 3dnow)
00886 H264_MC(avg_, 16,3dnow)
00887 H264_MC(put_, 4, mmx2)
00888 H264_MC(put_, 8, mmx2)
00889 H264_MC(put_, 16,mmx2)
00890 H264_MC(avg_, 4, mmx2)
00891 H264_MC(avg_, 8, mmx2)
00892 H264_MC(avg_, 16,mmx2)
00893
00894
/*
 * The chroma template file is included three times. Before each inclusion
 * H264_CHROMA_MC8_TMPL names the function to generate and H264_CHROMA_OP
 * supplies the asm fragment applied to the result; both are undefined
 * again afterwards. The template itself is outside this listing, so how it
 * uses the two macros is presumed from their names - TODO confirm.
 */
/* put variant: H264_CHROMA_OP expands to nothing. */
00895 #define H264_CHROMA_OP(S,D)
00896 #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
00897 #include "dsputil_h264_template_mmx.c"
00898 #undef H264_CHROMA_OP
00899 #undef H264_CHROMA_MC8_TMPL
00900
/* avg variant for mmx2: H264_CHROMA_OP emits pavgb S, D. */
00901 #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
00902 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
00903 #include "dsputil_h264_template_mmx.c"
00904 #undef H264_CHROMA_OP
00905 #undef H264_CHROMA_MC8_TMPL
00906
/* avg variant for 3dnow: H264_CHROMA_OP emits pavgusb S, D. */
00907 #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
00908 #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
00909 #include "dsputil_h264_template_mmx.c"
00910 #undef H264_CHROMA_OP
00911 #undef H264_CHROMA_MC8_TMPL
00912